@article {2757, title = {The Influence of Computerized Adaptive Testing on Psychometric Theory and Practice}, journal = {Journal of Computerized Adaptive Testing}, volume = {11}, year = {2024}, abstract = {

The major premise of this article is that part of the stimulus for the evolution of psychometric theory since the 1950s was the introduction of the concept of computerized adaptive testing (CAT) or its earlier non-CAT variations. The conceptual underpinning of CAT that had the most influence on psychometric theory was the shift of emphasis from the test (or test score) to the test item (or item score) as the focus of analysis. The change in focus allowed a change in the way that test results are conceived of as measurements. It also resolved the conflict among a number of ideas that were present in the early work on psychometric theory. Some of the conflicting ideas are summarized below to show how work on the development of CAT resolved some of those conflicts.

}, keywords = {computerized adaptive testing, Item Response Theory, paradigm shift, scaling theory, test design}, issn = {2165-6592}, doi = {10.7333/2403-1101001}, url = {https://jcatpub.net/index.php/jcat/issue/view/34/9}, author = {Reckase, Mark D.} } @article {2753, title = {Expanding the Meaning of Adaptive Testing to Enhance Validity}, journal = {Journal of Computerized Adaptive Testing}, volume = {10}, year = {2023}, pages = {22-31}, keywords = {Adaptive Testing, CAT, CBT, test-taking disengagement, validity}, doi = {10.7333/2305-1002022}, author = {Steven L. Wise} } @article {2754, title = {How Do Trait Change Patterns Affect the Performance of Adaptive Measurement of Change?}, journal = {Journal of Computerized Adaptive Testing}, volume = {10}, year = {2023}, pages = {32-58}, keywords = {adaptive measurement of change, computerized adaptive testing, longitudinal measurement, trait change patterns}, doi = {10.7333/2307-1003032}, author = {Ming Him Tai and Allison W. Cooperman and Joseph N. DeWeese and David J. Weiss} } @article {2751, title = {The (non)Impact of Misfitting Items in Computerized Adaptive Testing}, journal = {Journal of Computerized Adaptive Testing}, volume = {9}, year = {2022}, keywords = {computerized adaptive testing, item fit, three-parameter logistic model}, doi = {10.7333/2211-0902008}, url = {https://jcatpub.net/index.php/jcat/issue/view/26}, author = {Christine E. DeMars} } @article {2702, title = {How Adaptive Is an Adaptive Test: Are All Adaptive Tests Adaptive?}, journal = {Journal of Computerized Adaptive Testing}, volume = {7}, year = {2019}, pages = {1-14}, keywords = {computerized adaptive test, multistage test, statistical indicators of amount of adaptation}, doi = {10.7333/1902-0701001}, url = {http://iacat.org/jcat/index.php/jcat/article/view/69/34}, author = {Mark Reckase and Unhee Ju and Sewon Kim} } @article {2717, title = {Time-Efficient Adaptive Measurement of Change}, journal = {Journal of Computerized Adaptive Testing}, volume = {7}, year = {2019}, pages = {15-34}, abstract = {

The adaptive measurement of change (AMC) refers to the use of computerized adaptive testing (CAT) at multiple occasions to efficiently assess a respondent's improvement, decline, or sameness from occasion to occasion. Whereas previous AMC research focused on administering the most informative item to a respondent at each stage of testing, the current research proposes the use of Fisher information per time unit as an item selection procedure for AMC. The latter procedure incorporates not only the amount of information provided by a given item but also the expected amount of time required to complete it. In a simulation study, the use of Fisher information per time unit item selection resulted in a lower false positive rate in the majority of conditions studied, and a higher true positive rate in all conditions studied, compared to item selection via Fisher information without accounting for the expected time taken. Future directions of research are suggested.
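A minimal sketch of the selection rule described above: score each remaining item by Fisher information divided by its expected completion time, assuming a 2PL response model and a lognormal response-time model. The function names, parameterization, and item bank are illustrative assumptions, not the authors' implementation.

```python
# Illustrative sketch of information-per-time-unit item selection (not the authors' code).
import numpy as np

def fisher_info_2pl(theta, a, b):
    """Fisher information of a 2PL item at ability theta."""
    p = 1.0 / (1.0 + np.exp(-a * (theta - b)))
    return a ** 2 * p * (1.0 - p)

def expected_time(beta, tau=0.0, sigma=0.4):
    """Expected response time under an assumed lognormal response-time model."""
    return np.exp(beta - tau + 0.5 * sigma ** 2)

def select_item(theta, tau, bank, administered):
    """Return the unused item with maximum information per expected second."""
    best, best_ratio = None, -np.inf
    for j, (a, b, beta) in enumerate(bank):
        if j in administered:
            continue
        ratio = fisher_info_2pl(theta, a, b) / expected_time(beta, tau)
        if ratio > best_ratio:
            best, best_ratio = j, ratio
    return best

# bank entries: (a, b, beta), where beta is the item's log-time intensity
bank = [(1.2, -0.5, 3.9), (0.9, 0.3, 3.4), (1.5, 0.1, 4.2)]
next_item = select_item(theta=0.2, tau=0.1, bank=bank, administered={0})
```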

}, keywords = {adaptive measurement of change, computerized adaptive testing, Fisher information, item selection, response-time modeling}, issn = {2165-6592}, doi = {10.7333/1909-0702015}, url = {http://iacat.org/jcat/index.php/jcat/article/view/73/35}, author = {Matthew Finkelman and Chun Wang} } @conference {2667, title = {Adapting Linear Models for Optimal Test Design to More Complex Test Specifications}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Combinatorial optimization (CO) has proven to be a very helpful approach for addressing test assembly issues and for providing solutions. CO has been applied to several test designs, including (1) the development of linear test forms, (2) computerized adaptive testing, and (3) multistage testing. In his seminal work, van der Linden (2006) laid out the basis for using linear models for simultaneously assembling exams and item pools in a variety of conditions: (1) for single tests and multiple tests; (2) with item sets, etc. However, for some testing programs, the number and complexity of test specifications can grow rapidly. Consequently, the mathematical representation of the test assembly problem goes beyond most approaches reported either in van der Linden's book or in the majority of other publications related to test assembly. In this presentation, we extend van der Linden's framework by including the concept of blocks for test specifications. We modify the usual mathematical notation of a test assembly problem by including this concept and we show how it can be applied to various test designs. Finally, we will demonstrate an implementation of this approach in a stand-alone software application called ATASolver.

}, keywords = {Complex Test Specifications, Linear Models, Optimal Test Design}, author = {Maxim Morin} } @conference {2653, title = {Adaptivity in a Diagnostic Educational Test}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

During the past five years, a diagnostic educational test for three subjects (writing Dutch, writing English, and math) has been developed in the Netherlands. The test informs students and their teachers about the students' strengths and weaknesses in such a manner that the learning process can be adjusted to their personal needs. It is a computer-based assessment for students in five different educational tracks midway through secondary education that can yield diagnoses of many sub-skills. One of the main challenges at the outset of the development was to devise a way to deliver many diagnoses within a reasonable testing time. The answer to this challenge was to make the Diagnostic Educational Test (DET) adaptive.

In this presentation we will first discuss how the adaptivity is shaped towards the purpose of the Diagnostic Educational Test. The adaptive design, particularly working with item blocks, will be discussed, as well as the implemented adaptive rules. We will also show a simulation of different adaptive paths of students and some empirical information on the paths students took through the test.

}, keywords = {CAT, Diagnostic tests, Education}, author = {Sanneke Schouwstra} } @conference {2658, title = {Analysis of CAT Precision Depending on Parameters of the Item Pool}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

The purpose of this research project is to analyze the measurement precision of a latent variable depending on parameters of the item pool. The influence of the following factors is analyzed:

Factor A – range of variation of items in the pool. This factor varies on three levels with the following ranges in logits: a1 = [-3.0; +3.0], a2 = [-4.0; +4.0], a3 = [-5.0; +5.0].

Factor B – number of items in the pool. This factor varies on six levels with the following number of items at each level: b1 = 128, b2 = 256, b3 = 512, b4 = 1024, b5 = 2048, b6 = 4096. The items are evenly distributed within each of the variation ranges.

Factor C – examinees' proficiency, which varies at 30 levels (c1, c2, …, c30) evenly distributed in the range [-3.0; +3.0] logits.

The investigation was based on a simulation experiment within the framework of the theory of latent variables.

Response Y is the precision of measurement of examinees' proficiency, calculated as the difference between the true levels of examinees' proficiency and the estimates obtained by means of adaptive testing. Three-factor ANOVA was used for data processing.

The following results were obtained:

1. Factor A is significant. Ceteris paribus, the greater the range of variation of items in the pool, the higher the estimation precision is.

2. Factor B is significant. Ceteris paribus, the greater the number of items in the pool, the higher the estimation precision is.

3. Factor C is statistically insignificant at level α = .05. This means that the precision of estimation of examinees' proficiency is the same across the range of their variation.

4. The only significant interaction is AB: increasing the number of items in the pool decreases the effect of the range of variation of items in the pool. (An illustrative sketch of the ANOVA step follows.)
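The ANOVA step of the design described above can be sketched as follows; the data frame here is random filler standing in for the simulated precision values, and the factor codings are assumptions, so this only illustrates the analysis mechanics, not the study's results.

```python
# Illustrative three-factor ANOVA on simulated precision values (placeholder data).
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

rng = np.random.default_rng(0)
levels_A = ["[-3,3]", "[-4,4]", "[-5,5]"]        # range of item difficulties
levels_B = [128, 256, 512, 1024, 2048, 4096]     # item pool size
levels_C = np.round(np.linspace(-3, 3, 30), 2)   # true proficiency level

rows = [(a, b, c, abs(rng.normal(0.0, 0.3)))     # placeholder |true - estimated| theta
        for a in levels_A for b in levels_B for c in levels_C for _ in range(10)]
df = pd.DataFrame(rows, columns=["A", "B", "C", "error"])

model = smf.ols("error ~ C(A) * C(B) * C(C)", data=df).fit()
print(sm.stats.anova_lm(model, typ=2))           # significance of A, B, C and interactions
```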

}, keywords = {CAT, Item parameters, Precision}, url = {https://drive.google.com/file/d/1Bwe58kOQRgCSbB8x6OdZTDK4OIm3LQI3/view?usp=drive_web}, author = {Anatoly Maslak and Stanislav Pozdniakov} } @conference {2664, title = {Bayesian Perspectives on Adaptive Testing}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Although adaptive testing is usually treated from the perspective of maximum-likelihood parameter estimation and maximum-information item selection, a Bayesian perspective is more natural, statistically efficient, and computationally tractable. This observation not only holds for the core process of ability estimation but includes such processes as item calibration and real-time monitoring of item security as well. Key elements of the approach are parametric modeling of each relevant process, updating of the parameter estimates after the arrival of each new response, and optimal design of the next step.

The purpose of the symposium is to illustrate the role of Bayesian statistics in this approach. The first presentation discusses a basic Bayesian algorithm for the sequential update of any parameter in adaptive testing and illustrates the idea of Bayesian optimal design for the two processes of ability estimation and online item calibration. The second presentation generalizes the ideas to the case of adaptive testing with polytomous items. The third presentation uses the fundamental Bayesian idea of sampling from updated posterior predictive distributions ("multiple imputations") to deal with the problem of scoring incomplete adaptive tests.
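As a toy illustration of the sequential updating the first presentation describes, a discretized posterior over ability can be re-weighted by the likelihood of each new response; the 2PL model, grid, and prior below are generic assumptions, not the presenters' algorithm.

```python
# Generic grid-based Bayesian update of ability after each response (illustrative).
import numpy as np

grid = np.linspace(-4, 4, 161)                    # support for theta
posterior = np.exp(-0.5 * grid ** 2)              # N(0, 1) prior, unnormalized

def update(posterior, a, b, x):
    """Multiply the posterior by the 2PL likelihood of response x (0 or 1)."""
    p = 1.0 / (1.0 + np.exp(-a * (grid - b)))
    posterior = posterior * (p if x == 1 else 1.0 - p)
    return posterior / np.trapz(posterior, grid)  # renormalize

posterior = update(posterior, a=1.3, b=0.2, x=1)  # examinee answers an item correctly
eap = np.trapz(grid * posterior, grid)            # posterior mean (EAP estimate)
psd = np.sqrt(np.trapz((grid - eap) ** 2 * posterior, grid))  # posterior SD
```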

}, keywords = {Bayesian Perspective, CAT}, author = {Wim J. van der Linden and Bingnan Jiang and Hao Ren and Seung W. Choi and Qi Diao} } @conference {2641, title = {Is CAT Suitable for Automated Speaking Test?}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

We have developed an automated scoring system for Japanese speaking proficiency, SJ-CAT (Speaking Japanese Computerized Adaptive Test), which has been operational for the last few months. One of the unique features of the test is that it is adaptive, based on polytomous IRT.

SJ-CAT consists of two sections: Section 1 has sentence reading-aloud tasks and multiple-choice reading tasks, and Section 2 has sentence generation tasks and open answer tasks. In a reading-aloud task, a test taker reads a phoneme-balanced sentence on the screen after listening to a model reading. In a multiple-choice reading task, a test taker sees a picture and reads aloud the one sentence among three on the screen that describes the scene most appropriately. In a sentence generation task, a test taker sees a picture or watches a video clip and describes the scene in his/her own words for about ten seconds. In an open answer task, the test taker expresses support for or opposition to a topic (e.g., nuclear power generation), with reasons, for about 30 seconds.

In the course of the development of the test, we found many unexpected and unique characteristics of a speaking CAT that are not found in usual multiple-choice CATs. In this presentation, we will discuss some of the factors that were not noticed in our previous project of developing the dichotomous J-CAT (Japanese Computerized Adaptive Test), which consists of vocabulary, grammar, reading, and listening. Firstly, we will claim that the distribution of item difficulty parameters depends on the type of item: with unrestricted item types such as open questions, it is difficult for an item pool to achieve an ideal difficulty distribution, whether normal or uniform. Secondly, contrary to our expectations, open questions are not necessarily more difficult to operate in an automated scoring system than more restricted questions such as sentence reading, as long as a suitable algorithm for open-question scoring can be set up. Thirdly, we will show that the convergence of the standard deviation of the posterior distribution, or the standard error of the theta parameter, is faster in the polytomous IRT used for SJ-CAT than in the dichotomous IRT used in J-CAT. Fourthly, we will discuss problems in the equating of items in SJ-CAT, and suggest introducing deep learning with reinforcement learning instead of equating. Finally, we will discuss issues in the operation of SJ-CAT on the web, including scoring speed, operating costs, and security, among others.

}, keywords = {Automated Speaking Test, CAT, language testing}, author = {Shingo Imai} } @conference {2637, title = {Comparison of Pretest Item Calibration Methods in a Computerized Adaptive Test (CAT)}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Calibration methods for pretest items in a computerized adaptive test (CAT) are not a new area of research inquiry. After decades of research on CAT, the fixed item parameter calibration (FIPC) method has been widely accepted and used by practitioners to address two CAT calibration issues: (a) the restricted ability range each item is exposed to, and (b) a sparse response data matrix. In FIPC, the parameters of the operational items are fixed at their original values, and multiple expectation maximization (EM) cycles are used to estimate parameters of the pretest items, with the prior ability distribution being updated multiple times (Ban, Hanson, Wang, Yi, & Harris, 2001; Kang & Petersen, 2009; Pommerich & Segall, 2003).

Another calibration method is the fixed person parameter calibration (FPPC) method proposed by Stocking (1988) as "Method A." Under this approach, candidates' ability estimates are fixed in the calibration of pretest items and they define the scale on which the parameter estimates are reported. The logic of FPPC is suitable for CAT applications because the person parameters are estimated based on operational items and available for pretest item calibration. In Stocking (1988), the FPPC was evaluated using the LOGIST computer program developed by Wood, Wingersky, and Lord (1976). He reported that "Method A" produced larger root mean square errors (RMSEs) in the middle ability range than "Method B," which required the use of anchor items (administered non-adaptively) and linking steps to attempt to correct for the potential scale drift due to the use of imperfect ability estimates.

Since then, new commercial software tools such as BILOG-MG and flexMIRT (Cai, 2013) have been developed to handle the FPPC method with different implementations (e.g., the MH-RM algorithm with flexMIRT). The performance of the FPPC method with those new software tools, however, has rarely been researched in the literature.

In our study, we evaluated the performance of two pretest item calibration methods using flexMIRT, the new software tool. The FIPC and FPPC are compared under various CAT settings. Each simulated exam contains 75% operational items and 25% pretest items, and real item parameters are used to generate the CAT data. This study also addresses the lack of guidelines in the existing CAT item calibration literature regarding population ability shift and exam length (more accurate theta estimates are expected in longer exams). Thus, this study investigates the following four factors and their impact on parameter estimation accuracy: (1) candidate population changes (3 ability distributions); (2) exam length (20: 15 OP + 5 PT, 40: 30 OP + 10 PT, and 60: 45 OP + 15 PT); (3) data-model fit (3PL and 3PL with fixed c); and (4) pretest item calibration sample sizes (300, 500, and 1000). This study's findings will fill the gap in this area of research and thus provide new information on which practitioners can base their decisions when selecting a pretest calibration method for their exams.

References

Ban, J. C., Hanson, B. A., Wang, T., Yi, Q., & Harris, D. J. (2001). A comparative study of online pretest item calibration/scaling methods in computerized adaptive testing. Journal of Educational Measurement, 38(3), 191-212.

Cai, L. (2013). flexMIRT: Flexible multilevel multidimensional item analysis and test scoring (Version 2) [Computer software]. Chapel Hill, NC: Vector Psychometric Group.

Kang, T., & Petersen, N. S. (2009). Linking item parameters to a base scale (Research Report No. 2009-2). Iowa City, IA: ACT.

Pommerich, M., & Segall, D. O. (2003, April). Calibrating CAT pools and online pretest items using marginal maximum likelihood methods. Paper presented at the annual meeting of the National Council on Measurement in Education, Chicago, IL.

Stocking, M. L. (1988). Scale drift in online calibration (Research Report No. 88-28). Princeton, NJ: Educational Testing Service.

Wood, R. L., Wingersky, M. S., & Lord, F. M. (1976). LOGIST: A computer program for estimating examinee ability and item characteristic curve parameters (RM 76-6) [Computer program]. Princeton, NJ: Educational Testing Service.

}, keywords = {CAT, Pretest Item Calibration}, author = {Huijuan Meng and Chris Han} } @conference {2632, title = {A Comparison of Three Empirical Reliability Estimates for Computerized Adaptive Testing}, booktitle = {IACAT 2017 conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Reliability estimates in computerized adaptive testing (CAT) are derived from estimated thetas and the standard errors of those estimates. In practice, the observed standard error (OSE) of an estimated theta can be obtained from the test information function for each examinee under item response theory (IRT). Unlike in classical test theory (CTT), OSEs in IRT are conditional on each estimated theta, so these values must be marginalized to characterize test reliability. The arithmetic mean, the harmonic mean, and Jensen equality were applied to marginalize the OSEs and estimate CAT reliability. Based on the different marginalization methods, three empirical CAT reliability estimates were compared with the true reliability. Results showed that the three empirical CAT reliabilities underestimated the true reliability at short test lengths (< 40), whereas at long test lengths (> 40) the magnitude of the CAT reliabilities was ordered as Jensen equality, harmonic mean, and arithmetic mean. Specifically, Jensen equality overestimated the true reliability across all conditions at long test lengths (> 50).
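For context, the conditional standard error in IRT comes from the test information function, and a common empirical reliability estimate combines the marginalized squared standard errors with the variance of the theta estimates; the abstract's three methods differ in how that marginal error term is formed. The formulas below are the standard textbook forms, shown as an assumption rather than the exact expressions used in the study:

\[
\mathrm{SE}(\hat\theta) = \frac{1}{\sqrt{I(\hat\theta)}},
\qquad
\hat\rho \;=\; \frac{\widehat{\sigma}^{2}_{\hat\theta}}{\widehat{\sigma}^{2}_{\hat\theta} + \overline{\mathrm{SE}^{2}}},
\]

where \(I(\hat\theta)\) is the test information at the final ability estimate and \(\overline{\mathrm{SE}^{2}}\) is the marginalized (e.g., arithmetic-mean) squared conditional standard error across examinees.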

}, keywords = {CAT, Reliability}, url = {https://drive.google.com/file/d/1gXgH-epPIWJiE0LxMHGiCAxZZAwy4dAH/view?usp=sharing}, author = {Dong Gi Seo} } @conference {2655, title = {Computerized Adaptive Testing for Cognitive Diagnosis in Classroom: A Nonparametric Approach}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

In the past decade, cognitive diagnosis models (CDMs) of educational test performance have received increasing attention among educational researchers (for details, see Fu & Li, 2007, and Rupp, Templin, & Henson, 2010). CDMs of educational test performance decompose the ability domain of a given test into specific skills, called attributes, each of which an examinee may or may not have mastered. The resulting attribute profile documents the individual's strengths and weaknesses within the ability domain. Cognitive diagnostic computerized adaptive testing (CD-CAT) has been suggested by researchers as a diagnostic tool for assessment and evaluation (e.g., Cheng & Chang, 2007; Cheng, 2009; Liu, You, Wang, Ding, & Chang, 2013; Tatsuoka & Tatsuoka, 1997). While model-based CD-CAT is relatively well-researched in the context of large-scale assessments, this type of system has not received the same degree of development in small-scale settings, where it would be most useful. The main challenge is that the statistical estimation techniques successfully applied to parametric CD-CAT require large samples to guarantee the reliable calibration of item parameters and accurate estimation of examinees' attribute profiles. In response to the challenge, a nonparametric approach that does not require any parameter calibration, and thus can be used in small educational programs, is proposed. The proposed nonparametric CD-CAT relies on the same principle as the regular CAT algorithm, but uses the nonparametric classification method (Chiu & Douglas, 2013) to assess and update the student's ability state while the test proceeds. Based on a student's initial responses, a neighborhood of candidate proficiency classes is identified, and items not characteristic of the chosen proficiency classes are precluded from being chosen next. The response to the next item then allows for an update of the skill profile, and the set of possible proficiency classes is further narrowed. In this manner, the nonparametric CD-CAT cycles through item administration and update stages until the most likely proficiency class has been pinpointed. The simulation results show that the proposed method outperformed the compared parametric CD-CAT algorithms, and the differences were significant when the item parameter calibration was not optimal.
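A compact sketch of the nonparametric classification idea the method builds on (Chiu & Douglas, 2013): each candidate attribute pattern implies an ideal response vector under a conjunctive (DINA-like) rule, and the examinee is assigned to the pattern whose ideal responses are closest in Hamming distance on the items answered so far. The Q-matrix, responses, and function names below are illustrative.

```python
# Illustrative nonparametric classification by proximity to ideal response patterns.
import numpy as np
from itertools import product

def ideal_response(alpha, Q):
    """Conjunctive ideal response: 1 iff all attributes required by the item are mastered."""
    return np.all(Q <= alpha, axis=1).astype(int)

def rank_patterns(responses, Q, answered):
    """Rank all attribute patterns by Hamming distance on the answered items."""
    K = Q.shape[1]
    patterns = [np.array(p) for p in product([0, 1], repeat=K)]
    dists = np.array([np.abs(responses[answered] - ideal_response(p, Q)[answered]).sum()
                      for p in patterns])
    order = np.argsort(dists)
    return [patterns[i] for i in order], dists[order]

# Toy Q-matrix: 5 items, 3 attributes; the examinee has answered items 0-3
Q = np.array([[1, 0, 0], [0, 1, 0], [1, 1, 0], [0, 0, 1], [1, 1, 1]])
x = np.array([1, 1, 1, 0, 0])
answered = np.array([0, 1, 2, 3])
ranked, d = rank_patterns(x, Q, answered)
# the closest pattern(s) form the neighborhood used to choose the next item
```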

References

Cheng, Y. (2009). When cognitive diagnosis meets computerized adaptive testing: CD-CAT. Psychometrika, 74, 619-632.

Cheng, Y., & Chang, H. (2007). The modified maximum global discrimination index method for cognitive diagnostic CAT. In D. Weiss (Ed.), Proceedings of the 2007 GMAC Computerized Adaptive Testing Conference.

Chiu, C.-Y., & Douglas, J. A. (2013). A nonparametric approach to cognitive diagnosis by proximity to ideal response patterns. Journal of Classification, 30, 225-250.

Fu, J., & Li, Y. (2007). An integrative review of cognitively diagnostic psychometric models. Paper presented at the Annual Meeting of the National Council on Measurement in Education, Chicago, IL.

Liu, H., You, X., Wang, W., Ding, S., & Chang, H. (2013). The development of computerized adaptive testing with cognitive diagnosis for an English achievement test in China. Journal of Classification, 30, 152-172.

Rupp, A. A., Templin, J. L., & Henson, R. A. (2010). Diagnostic measurement: Theory, methods, and applications. New York: Guilford.

Tatsuoka, K. K., & Tatsuoka, M. M. (1997). Computerized cognitive diagnostic adaptive testing: Effect on remedial instruction as empirical validation. Journal of Educational Measurement, 34, 3-20.

}, keywords = {CD-CAT, non-parametric approach}, author = {Yuan-Pei Chang and Chia-Yi Chiu and Rung-Ching Tsai} } @conference {2654, title = {Concerto 5 Open Source CAT Platform: From Code to Nodes}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Concerto 5 is the newest version of the Concerto open source R-based Computer-Adaptive Testing platform, which is currently used in educational testing and in clinical trials. In our quest to make CAT accessible to all, the latest version uses flowchart nodes to connect different elements of a test, so that CAT test creation is an intuitive high-level process that does not require writing code.

A test creator might connect an Info Page node to a Consent Page node, to a CAT node, and then to a Feedback node; after uploading their items, the test is done.

This talk will show the new flowchart interface, and demonstrate the creation of a CAT test from scratch in less than 10 minutes.

Concerto 5 also includes a new Polytomous CAT node, so CATs with Likert items can be easily created in the flowchart interface. This node is currently used in depression and anxiety tests in a clinical trial.

}, keywords = {Concerto 5, Open Source CAT}, url = {https://drive.google.com/open?id=11eu1KKILQEoK5c-CYO1P1AiJgiQxX0E0}, author = {David Stillwell} } @conference {2657, title = {Developing a CAT: An Integrated Perspective}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Most resources on computerized adaptive testing (CAT) tend to focus on psychometric aspects such as mathematical formulae for item selection or ability estimation. However, development of a CAT assessment requires a holistic view of project management, financials, content development, product launch and branding, and more. This presentation will develop such a holistic view, which serves several purposes, including providing a framework for validity, estimating costs and ROI, and making better decisions regarding the psychometric aspects.

Thompson and Weiss (2011) presented a 5-step model for developing computerized adaptive tests (CATs). This model will be presented and discussed as the core of this holistic framework, then applied to real-life examples. While most CAT research focuses on developing new quantitative algorithms, this presentation is instead intended to help researchers evaluate and select algorithms that are most appropriate for their needs. It is therefore ideal for practitioners who are familiar with the basics of item response theory and CAT, and wish to explore how they might apply these methodologies to improve their assessments.

Steps include:

1. Feasibility, applicability, and planning studies

2. Develop item bank content or utilize existing bank

3. Pretest and calibrate item bank

4. Determine specifications for final CAT

5. Publish live CAT.

For example, Step 1 will contain simulation studies that estimate item bank requirements; these estimates can then be used to determine the costs of content development, which in turn can be integrated into an estimated project cost and timeline. Such information is vital in determining whether the CAT should even be developed in the first place.

References

Thompson, N. A., & Weiss, D. J. (2011). A framework for the development of computerized adaptive tests. Practical Assessment, Research & Evaluation, 16(1). Retrieved from http://pareonline.net/getvn.asp?v=16&n=1.

}, keywords = {CAT Development, integrated approach}, url = {https://drive.google.com/open?id=1Jv8bpH2zkw5TqSMi03e5JJJ98QtXf-Cv}, author = {Nathan Thompson} } @conference {2668, title = {The Development of a Web-Based CAT in China}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Cognitive ability assessment has been widely used as a recruitment tool in hiring potential employees. Traditional cognitive ability tests have been encountering threats from item exposure and long testing times. In China especially, campus recruitment places a high value on short answering times and anti-cheating measures. Beisen, the largest domestic online assessment software provider, developed a web-based CAT for cognitive ability that assesses verbal, quantitative, logical, and spatial ability in order to decrease answering time, improve assessment accuracy, and reduce threats from cheating and faking in online ability testing. The web-based test provides convenient testing for examinees, who can easily access the test via the internet by logging in to the test website at any time and place through any internet-enabled device (e.g., laptops, iPads, and smartphones).

We designed the CAT around strategies for establishing the item bank, setting the starting point, item selection, scoring, and termination. Additionally, we paid close attention to administering the test via the web. For the CAT procedures, we employed online calibration to establish a stable and expanding item bank, and integrated maximum Fisher information, an α-stratified strategy, and randomization for item selection and for coping with item exposure. Fixed-length and variable-length strategies were combined in terminating the test. To deliver fluid web-based testing, we employed cloud computing techniques and designed each computing process carefully. Distributed computation was used for scoring, executing EAP estimation and item selection at high speed. Caching all items on the servers in advance helps shorten the process of loading items onto examinees' devices. Horizontally scalable cloud servers cope with high concurrency. The heavy computation in item selection was converted to looking up items in a precomputed information matrix table.

We examined average accuracy, bank usage, and computing performance under both laboratory and operational testing conditions. In a test of almost 28,000 examinees, we found that bank usage averaged 50%, and that 80% of tests terminated at a test information of 10, with an average of 9.6. Under high concurrency, testing was unhindered, and scoring plus item selection took an average of only 0.23 s per examinee.
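A simplified sketch of the kind of item selection described above: compute 3PL Fisher information at the current ability estimate and choose at random among the top few candidates (randomesque exposure control). The α-stratification, precomputed information table, and caching layers are omitted; this is an illustration, not Beisen's implementation.

```python
# Illustrative maximum-information selection with randomesque exposure control.
import numpy as np

def info_3pl(theta, a, b, c):
    """Fisher information of a 3PL item at ability theta."""
    p = c + (1 - c) / (1 + np.exp(-a * (theta - b)))
    return a ** 2 * ((p - c) ** 2 / (1 - c) ** 2) * ((1 - p) / p)

def next_item(theta, bank, used, k=5, rng=np.random.default_rng()):
    """Randomly choose among the k most informative unused items."""
    candidates = [(info_3pl(theta, *bank[j]), j) for j in range(len(bank)) if j not in used]
    top = sorted(candidates, reverse=True)[:k]
    return top[rng.integers(len(top))][1]

# bank entries: (a, b, c) item parameters
bank = [(1.2, -0.4, 0.20), (0.8, 0.1, 0.25), (1.5, 0.6, 0.20),
        (1.0, -1.0, 0.20), (1.3, 0.2, 0.15), (0.9, 0.9, 0.20)]
item = next_item(theta=0.0, bank=bank, used={2})
```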

}, keywords = {China, Web-Based CAT}, author = {Chongli Liang and Danjun Wang and Dan Zhou and Peida Zhan} } @conference {2656, title = {Efficiency of Item Selection in CD-CAT Based on Conjunctive Bayesian Network Modeling Hierarchical attributes}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Cognitive diagnosis models (CDMs) aim to diagnose an examinee's mastery status on multiple fine-grained skills. As new developments in cognitive diagnosis methods emerge, much attention has also been given to cognitive diagnostic computerized adaptive testing (CD-CAT). Topics such as item selection methods, item exposure control strategies, and online calibration methods, which have been well studied for traditional item response theory (IRT) based CAT, are also investigated in the context of CD-CAT (e.g., Xu, Chang, & Douglas, 2003; Wang, Chang, & Huebner, 2011; Chen et al., 2012).

In the CDM framework, some researchers suggest modeling the structural relationships between cognitive skills, or attributes. In particular, attributes can be hierarchical, such that some attributes must be acquired before subsequent ones are mastered. For example, in mathematics, addition must be mastered before multiplication, which gives a hierarchy between the addition and multiplication skills. Recently, new CDMs considering attribute hierarchies have been suggested, including the Attribute Hierarchy Method (AHM; Leighton, Gierl, & Hunka, 2004) and the Hierarchical Diagnostic Classification Model (HDCM; Templin & Bradshaw, 2014).

Bayesian networks (BNs), probabilistic graphical models that represent the relationships among a set of random variables using a directed acyclic graph with conditional probability distributions, also provide an efficient framework for modeling the relationships between attributes (Culbertson, 2016). Among the various BNs, the conjunctive Bayesian network (CBN; Beerenwinkel, Eriksson, & Sturmfels, 2007) is a special kind of BN that assumes a partial ordering between occurrences of events and conjunctive constraints between them.

In this study, we propose using the CBN to model attribute hierarchies and discuss the advantages of the CBN for CDMs. We then explore the impact of the CBN modeling on the efficiency of item selection methods for CD-CAT when the attributes are truly hierarchical. To this end, two simulation studies, one for fixed-length CAT and another for variable-length CAT, are conducted. For each study, two attribute hierarchy structures with 5 and 8 attributes are assumed. Among the various item selection methods developed for CD-CAT, six algorithms are considered: the posterior-weighted Kullback-Leibler index (PWKL; Cheng, 2009), the modified PWKL index (MPWKL; Kaplan, de la Torre, & Barrada, 2015), Shannon entropy (SHE; Tatsuoka, 2002), mutual information (MI; Wang, 2013), the posterior-weighted CDM discrimination index (PWCDI; Zheng & Chang, 2016), and the posterior-weighted attribute-level CDM discrimination index (PWACDI; Zheng & Chang, 2016). The impact of Q-matrix structure, item quality, and test termination rules on the efficiency of the item selection algorithms is also investigated. Evaluation measures include attribute classification accuracy (fixed-length experiment) and the test length of CD-CAT until stopping (variable-length experiment).

The results of the study indicate that the efficiency of item selection is improved by directly modeling the attribute hierarchies using the CBN. The test length until achieving the diagnosis probability threshold was reduced to 50-70% for CBN-based CAT compared to CD-CAT assuming independence of attributes. The magnitude of improvement is greater when the cognitive model of the test includes more attributes and when the test length is shorter. We conclude by discussing how Q-matrix structure, item quality, and test termination rules affect the efficiency.
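For reference, the posterior-weighted Kullback-Leibler index of Cheng (2009), which several of the compared methods extend, can be written roughly as follows (notation slightly simplified here):

\[
\mathrm{PWKL}_j(\hat{\boldsymbol\alpha}) \;=\; \sum_{c} \pi(\boldsymbol\alpha_c \mid \mathbf{x})
\sum_{x=0}^{1} P(X_j = x \mid \hat{\boldsymbol\alpha})
\log \frac{P(X_j = x \mid \hat{\boldsymbol\alpha})}{P(X_j = x \mid \boldsymbol\alpha_c)},
\]

where \(\pi(\boldsymbol\alpha_c \mid \mathbf{x})\) is the posterior over candidate attribute patterns given the responses so far, \(\hat{\boldsymbol\alpha}\) is the current estimate, and the item \(j\) maximizing the index is administered next.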

References

Beerenwinkel, N., Eriksson, N., & Sturmfels, B. (2007). Conjunctive Bayesian networks. Bernoulli, 893-909.

Chen, P., Xin, T., Wang, C., & Chang, H. H. (2012). Online calibration methods for the DINA model with independent attributes in CD-CAT. Psychometrika, 77(2), 201-222.

Cheng, Y. (2009). When cognitive diagnosis meets computerized adaptive testing: CD-CAT. Psychometrika, 74(4), 619-632.

Culbertson, M. J. (2016). Bayesian networks in educational assessment: The state of the field. Applied Psychological Measurement, 40(1), 3-21.

Kaplan, M., de la Torre, J., & Barrada, J. R. (2015). New item selection methods for cognitive diagnosis computerized adaptive testing. Applied Psychological Measurement, 39(3), 167-188.

Leighton, J. P., Gierl, M. J., & Hunka, S. M. (2004). The attribute hierarchy method for cognitive assessment: A variation on Tatsuoka's rule-space approach. Journal of Educational Measurement, 41(3), 205-237.

Tatsuoka, C. (2002). Data analytic methods for latent partially ordered classification models. Journal of the Royal Statistical Society: Series C (Applied Statistics), 51(3), 337-350.

Templin, J., & Bradshaw, L. (2014). Hierarchical diagnostic classification models: A family of models for estimating and testing attribute hierarchies. Psychometrika, 79(2), 317-339.

Wang, C. (2013). Mutual information item selection method in cognitive diagnostic computerized adaptive testing with short test length. Educational and Psychological Measurement, 73(6), 1017-1035.

Wang, C., Chang, H. H., & Huebner, A. (2011). Restrictive stochastic item selection methods in cognitive diagnostic computerized adaptive testing. Journal of Educational Measurement, 48(3), 255-273.

Xu, X., Chang, H., & Douglas, J. (2003, April). A simulation study to compare CAT strategies for cognitive diagnosis. Paper presented at the annual meeting of the National Council on Measurement in Education, Chicago, IL.

Zheng, C., & Chang, H. H. (2016). High-efficiency response distribution-based item selection algorithms for short-length cognitive diagnostic computerized adaptive testing. Applied Psychological Measurement, 40(8), 608-624.

}, keywords = {CD-CAT, Conjuctive Bayesian Network Modeling, item selection}, url = {https://drive.google.com/open?id=1RbO2gd4aULqsSgRi_VZudNN_edX82NeD}, author = {Soo-Yun Han and Yun Joo Yoo} } @conference {2652, title = {Efficiency of Targeted Multistage Calibration Designs under Practical Constraints: A Simulation Study}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Calibration of an item bank for computer adaptive testing requires substantial resources. In this study, we focused on two related research questions. First, we investigated whether the efficiency of item calibration under the Rasch model could be enhanced by calibration designs that optimize the match between item difficulty and student ability (Berger, 1991). Therefore, we introduced targeted multistage calibration designs, a design type that refers to a combination of traditional targeted calibration designs and multistage designs. As such, targeted multistage calibration designs consider ability-related background variables (e.g., grade in school), as well as performance (i.e., outcome of a preceding test stage) for assigning students to suitable items.

Second, we explored how limited a priori knowledge about item difficulty affects the efficiency of both targeted calibration designs and targeted multistage calibration designs. When arranging items within a given calibration design, test developers need to know the item difficulties to locate items optimally within the design. However, usually, no empirical information about item difficulty is available before item calibration. Owing to missing empirical data, test developers might fail to assign all items to the most suitable location within a calibration design.

Both research questions were addressed in a simulation study in which we varied the calibration design, as well as the accuracy of item distribution across the different booklets or modules within each design (i.e., the number of misplaced items). The results indicated that targeted multistage calibration designs were more efficient than ordinary targeted designs under optimal conditions. In particular, targeted multistage calibration designs provided more accurate estimates for very easy and very difficult items. Limited knowledge about item difficulty during test construction impaired the efficiency of all designs. The loss of efficiency was considerable for one of the two investigated targeted multistage calibration designs, whereas the targeted designs were more robust.

References

Berger, M. P. F. (1991). On the efficiency of IRT models when applied to different sampling designs. Applied Psychological Measurement, 15(3), 293-306. doi:10.1177/014662169101500310

}, keywords = {CAT, Efficiency, Multistage Calibration}, url = {https://drive.google.com/file/d/1ko2LuiARKqsjL_6aupO4Pj9zgk6p_xhd/view?usp=sharing}, author = {Stephanie Berger and Angela J. Verschoor and Theo Eggen and Urs Moser} } @conference {2635, title = {Evaluation of Parameter Recovery, Drift, and DIF with CAT Data}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Parameter drift and differential item functioning (DIF) analyses are frequent components of a test maintenance plan. That is, after a test form is published, organizations will often calibrate post-publishing data at a later date to evaluate whether the performance of the items or the test has changed over time. For example, if item content is leaked, the items might gradually become easier over time, and item statistics or parameters can reflect this.

When tests are published under a computerized adaptive testing (CAT) paradigm, they are nearly always calibrated with item response theory (IRT). IRT calibrations assume that range restriction is not an issue, that is, that each item is administered to examinees across a range of ability. CAT data violate this assumption. However, some organizations still wish to evaluate the continuing performance of the items from a DIF or drift perspective.

This presentation will evaluate just how inaccurate DIF and drift analyses might be on CAT data, using a Monte Carlo parameter recovery methodology. Known item parameters will be used to generate both linear and CAT data sets, which are then calibrated for DIF and drift. In addition, we will implement Randomesque item exposure constraints in some CAT conditions, as this randomization directly alleviates the range restriction problem somewhat, but it is an empirical question as to whether this improves the parameter recovery calibrations.

}, keywords = {CAT, DIF, Parameter Drift, Parameter Recovery}, url = {https://drive.google.com/open?id=1F7HCZWD28Q97sCKFIJB0Yps0H66NPeKq}, author = {Nathan Thompson and Jordan Stoeger} } @conference {2663, title = {From Blueprints to Systems: An Integrated Approach to Adaptive Testing}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

For years, test blueprints have told test developers how many items and what types of items will be included in a test. Adaptive testing adopted this approach from paper testing, and it is reasonably useful. Unfortunately, 'how many items and what types of items' are not all the elements one should consider when choosing items for an adaptive test. To fill in the gaps, practitioners have developed tools to allow an adaptive test to behave appropriately (e.g., exposure control, content balancing, and item drift procedures). Each of these tools involves the use of a separate process external to the primary item selection process.

The use of these subsidiary processes makes item selection less optimal and makes it difficult to prioritize aspects of selection. This discussion describes systems-based adaptive testing. This approach uses metadata concerning items, test takers, and test elements to select items. These elements are weighted by the stakeholders to shape an expanded blueprint designed for adaptive testing.

}, keywords = {CAT, integrated approach, Keynote}, url = {https://drive.google.com/open?id=1CBaAfH4ES7XivmvrMjPeKyFCsFZOpQMJ}, author = {Gage Kingsbury and Tony Zara} } @conference {2627, title = {How Adaptive is an Adaptive Test: Are all Adaptive Tests Adaptive?}, booktitle = {2017 IACAT Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

There are many different kinds of adaptive tests, but they all have the characteristic that some feature of the test is customized to the purpose of the test. In the time allotted, it is impossible to consider the adaptation of all of these types, so this address will focus on the "classic" adaptive test that matches the difficulty of the test to the capabilities of the person being tested. This address will first present information on the maximum level of adaptation that can occur and then compare the amount of adaptation that typically occurs on an operational adaptive test to that maximum level. An index is proposed to summarize the amount of adaptation, and it is argued that this type of index should be reported for operational adaptive tests to show the amount of adaptation that typically occurs.

}, keywords = {Adaptive Testing, CAT}, url = {https://drive.google.com/open?id=1Nj-zDCKk3DvHA4Jlp1qkb2XovmHeQfxu}, author = {Mark D Reckase} } @conference {2672, title = {An Imputation Approach to Handling Incomplete Computerized Tests}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

As technology advances, computerized adaptive testing (CAT) is becoming increasingly popular as it allows tests to be tailored to an examinee's ability. Nevertheless, examinees might devise testing strategies to use CAT to their advantage. For instance, if only the items that examinees answer count towards their score, then a higher theta score might be obtained by spending more time on items at the beginning of the test and skipping items at the end if time runs out. This type of gaming can be discouraged if examinees' scores are lowered or "penalized" based on the amount of non-response.

The goal of this study was to devise a penalty function that would meet two criteria: (1) the greater the omit rate, the greater the penalty, and (2) examinees with the same ability and the same omit rate should receive the same penalty. To create the penalty, theta was first calculated based on only the items the examinee responded to. Next, the expected number correct score (EXR) was obtained using that theta and the test characteristic curve. A penalized expected number correct score was obtained by multiplying EXR by the proportion of items the examinee responded to. Finally, the penalized theta was identified from the penalized expected score using the test characteristic curve. Based on the penalized theta and the item parameters of an unanswered item, the likelihood of a correct response is computed and employed to estimate the imputed score for the unanswered item.
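A minimal sketch of the penalty just described, assuming a 3PL test characteristic curve (TCC) and using root finding to invert it; the function names and item parameters are illustrative, not the authors' code.

```python
# Illustrative penalty for omitted responses via the test characteristic curve.
import numpy as np
from scipy.optimize import brentq

def p3pl(theta, a, b, c):
    """3PL probability of a correct response."""
    return c + (1 - c) / (1 + np.exp(-a * (theta - b)))

def tcc(theta, items):
    """Test characteristic curve: expected number-correct score at theta."""
    return sum(p3pl(theta, *it) for it in items)

def penalized_theta(theta_answered, items, prop_answered):
    """Shrink the expected number-correct score, then invert the TCC."""
    exr = tcc(theta_answered, items)            # expected number correct
    exr_pen = exr * prop_answered               # penalty: scale by completion rate
    return brentq(lambda t: tcc(t, items) - exr_pen, -6.0, 6.0)

# Example: 21-item test; the examinee answered 80% of items with theta_hat = 0.5
items = [(1.0, b, 0.2) for b in np.linspace(-2, 2, 21)]
theta_pen = penalized_theta(0.5, items, 0.8)
imputed = p3pl(theta_pen, 1.2, 0.3, 0.2)        # imputed score for an unanswered item
```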

Two datasets were used to generate tests with completion rates of 50%, 80%, and 90%. The first dataset included real data in which approximately 4,500 examinees responded to a 21-item test, which provided a baseline/truth. Sampling was done to achieve the three completion rate conditions. The second dataset consisted of simulated item scores for 50,000 simulees under a 1-2-4 multistage CAT design in which each stage contained seven items. Imputed item scores for unanswered items were computed using a variety of values for G (and therefore T). Three other approaches to handling unanswered items were also considered: all correct (i.e., T = 0), all incorrect (i.e., T = 1), and random scoring (i.e., T = 0.5).

The current study investigated the impact on theta estimates resulting from the proposed approach to handling unanswered items in a fixed-length CAT. In real testing situations, when examinees do not finish a test, it is hard to tell whether they tried diligently but ran out of time or whether they attempted to manipulate the scoring engine. To handle unfinished tests with penalties, the proposed approach considers examinees' abilities and incompletion rates. The results of this study provide direction for psychometric practitioners when considering penalties for omitted responses.

}, keywords = {CAT, imputation approach, incomplete computerized test}, url = {https://drive.google.com/open?id=1vznZeO3nsZZK0k6_oyw5c9ZTP8uyGnXh}, author = {Troy Chen and Chi-Yu Huang and Chunyan Liu} } @conference {2634, title = {Issues in Trait Range Coverage for Patient Reported Outcome Measure CATs - Extending the Ceiling for Above-average Physical Functioning}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

The use of a measure that fails to cover the upper range of functioning may produce results that can lead to serious misinterpretation. Scores produced by such a measure may fail to recognize significant improvement, or may not be able to demonstrate functioning commensurate with an important milestone. Accurate measurement of this range is critical for the assessment of physically active adults, e.g., athletes recovering from injury and active military personnel who wish to return to active service. Alternatively, a physical function (PF) measure with a low ceiling might fail to differentiate patients in rehabilitation who continue to improve but whose scores hit a ceiling due to the measure used.

The assessment of PF has greatly benefited from modern psychometric theory and the resulting scales, such as the Patient-Reported Outcomes Measurement Information System (PROMIS®) PF instruments. While PROMIS PF has extended the range of function upwards relative to older "legacy" instruments, few PROMIS PF items assess high levels of function. We report here on the development of higher-functioning items for the PROMIS PF bank.

An expert panel representing orthopedics, sports/military medicine, and rehabilitation reviewed existing instruments and wrote new items. After internal review, cognitive interviews were conducted with 24 individuals of average and high levels of physical function. The remaining candidate items were administered, along with 50 existing PROMIS anchor items, to an internet panel screened for low, average, and high levels of physical function (N = 1,600), as well as to members of Boston-area gyms (N = 344). The resulting data were subjected to standard psychometric analysis, along with multiple linking methods to place the new items on the existing PF metric. The new items were added to the full PF bank for simulated computerized adaptive testing (CAT).

Item response data were collected on 54 candidate items. Items that exhibited local dependence (LD) or differential item functioning (DIF) related to gender, age, race, education, or PF status were removed from consideration. Of the 50 existing PROMIS PF items, 31 were free of DIF and LD and were used as anchors. The parameters for the remaining new candidate items were estimated twice: freely estimated and linked with coefficients, and with fixed-anchor calibration. Both methods were comparable and had appropriate fit. The new items were added to the full PF bank for simulated CATs. The resulting CAT was able to extend the ceiling with high precision to a T-score of 68, suggesting accurate measurement for 97% of the general population.

Extending the range of items by which PF is measured will substantially improve measurement quality, applicability, and efficiency. The bank has incorporated these extension items and is available for use in research and clinics for brief CAT administration (see www.healthmeasures.net). Future research projects should focus on recovery trajectories of the measure for individuals with above average function who are recovering from injury.

}, keywords = {CAT, Issues, Patient Reported Outcome}, url = {https://drive.google.com/open?id=1ZC02F-dIyYovEjzpeuRdoXDiXMLFRuKb}, author = {Richard C. Gershon} } @conference {2646, title = {Item Pool Design and Evaluation}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Early work on CAT tended to use existing sets of items that came from fixed-length test forms. These sets of items were selected to meet requirements much different from those needed for a CAT, such as decision making or covering a content domain. However, there was also some early work that suggested having items equally distributed over the range of proficiency that was of interest, or concentrated at a decision point. There was also some work showing that there was bias in proficiency estimates when an item pool was too easy or too hard. These early findings eventually led to work on item pool design and, more recently, on item pool evaluation. This presentation gives a brief overview of these topics to provide some context for the following presentations in this symposium.

}, keywords = {CAT, Item Pool Design}, url = {https://drive.google.com/open?id=1ZAsqm1yNZlliqxEHcyyqQ_vOSu20xxZs}, author = {Mark D Reckase and Wei He and Jing-Ru Xu and Xuechun Zhou} } @conference {2662, title = {Item Response Time on Task Effect in CAT}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Introduction. In addition to reduced test length and increased measurement efficiency, computerized adaptive testing (CAT) can provide new insights into the cognitive process of task completion that cannot be mined via conventional tests. Response time is a primary characteristic of the task completion procedure. It has the potential to inform us about underlying processes. In this study, the relationship between response time and response accuracy will be investigated.

Hypothesis. The present study argues that the relationship between response time on task and response accuracy, which may be positive, negative, or curvilinear, will depend on the cognitive nature of the task items, holding the ability of the subjects and the difficulty of the items constant. The interpretations of these associations are not uniform either.

Research question. Is there a homogeneous effect of response time on test outcome across Graduate Record Examinations (GRE) quantitative and verbal items?

Proposed explanations. If the accuracy of cognitive test responses decreases with response time, then it is an indication that the underlying cognitive process is a degrading process such as knowledge retrieval. More accessible knowledge can be retrieved faster than less accessible knowledge. It is inherent to knowledge retrieval that the success rate declines with elapsing response time. For instance, in reading tasks, the time-on-task effect is negative, and the more negative it is, the easier the task. However, if the accuracy of cognitive test responses increases with response time, then the process is of an upgrading nature, with an increasing success rate as a function of response time. For example, problem solving takes time, and fast responses are less likely to be well-founded responses. It is of course also possible that the relationship is curvilinear, as when an increasing success rate is followed by a decreasing success rate or vice versa.

Methodology. The data are from computer-based GRE quantitative and verbal tests and will be analyzed within a generalized linear mixed model (GLMM) framework after controlling for ability and item difficulty as possible confounding factors. Here, a linear model means a linear combination of predictors determining the probability that person p answers item i correctly. The models are equivalent to advanced IRT models that go beyond the regular modeling of test responses in terms of one or more latent variables and item parameters. The lme4 package for R will be used to conduct the statistical calculations.
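One plausible form of such a GLMM, written here as an assumption based on the description above rather than the authors' exact specification, is

\[
\operatorname{logit} P(Y_{pi} = 1) \;=\; \theta_p + b_i + \beta_i \, T_{pi},
\]

where \(\theta_p\) is the ability of person \(p\), \(b_i\) the easiness of item \(i\), \(T_{pi}\) the (possibly log-transformed) response time of person \(p\) on item \(i\), and \(\beta_i\) the item-specific time-on-task effect whose sign and size are the quantities of interest.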

Implications. The right amount of testing time in CAT is important: too much is wasteful and costly, too little impacts score validity. The study is expected to provide new insight into the relationship between response time and response accuracy, which in turn will contribute to a better understanding of time effects and the relevant cognitive processes in CAT.

}, keywords = {CAT, Response time, Task effect}, author = {Yang Shi} } @conference {2629, title = {Item Selection Strategies for Developing CAT in Indonesia}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niiagata Seiryo University}, organization = {Niiagata Seiryo University}, address = {Niigata Japan}, abstract = {

Recently, the development of computerized testing in Indonesia has become quite promising. Many government institutions have used the technology for recruitment. Since the Indonesian Army acknowledged the benefits of computerized adaptive testing (CAT) over conventional test administration, the issue of selecting the first item has attracted attention. Given CAT\’s basic philosophy, several methods can be used to select the first item, such as educational level, an ability estimate from item simulation, or other approaches. The question remains how to apply these methods most effectively in the context of constrained adaptive testing. This paper reviews such strategies as they appear in the relevant literature, focusing on studies that have evaluated the effectiveness of first-item selection strategies for dichotomous scoring. It also discusses the strengths and weaknesses of each group of strategies using examples from simulation studies. No new research is presented; rather, a compendium of approaches is reviewed to give newcomers a broad view of first-item selection strategies.

}, keywords = {CAT, Indonesia, item selection strategies}, url = {https://www.youtube.com/watch?v=2KuFrRATq9Q}, author = {Istiani Chandra} } @conference {2643, title = {A Large-Scale Progress Monitoring Application with Computerized Adaptive Testing}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Many conventional assessment tools are available to teachers in schools for monitoring student progress in a formative manner. The outcomes of these assessment tools are essential to teachers\’ instructional modifications and schools\’ data-driven educational strategies, such as using remedial activities and planning instructional interventions for students with learning difficulties. When measuring student progress toward instructional goals or outcomes, assessments should be not only considerably precise but also sensitive to individual change in learning. Unlike conventional paper-and-pencil assessments, which are usually not appropriate for every student, computerized adaptive tests (CATs) are highly capable of estimating growth with minimal and consistent error. Therefore, CATs can be used as a progress monitoring tool for measuring student growth.

This study focuses on an operational CAT assessment that has been used for measuring student growth in reading during the academic school year. The sample of this study consists of nearly 7 million students from the 1st grade to the 12th grade in the US. The students received a CAT-based reading assessment periodically during the school year. The purpose of these periodical assessments is to measure the growth in students\’ reading achievement and identify the students who may need additional instructional support (e.g., academic interventions). Using real data, this study aims to address the following research questions: (1) How many CAT administrations are necessary to make psychometrically sound decisions about the need for instructional changes in the classroom or when to provide academic interventions?; (2) What is the ideal amount of time between CAT administrations to capture student growth for the purpose of producing meaningful decisions from assessment results?

To address these research questions, we first used the Theil-Sen estimator for robustly fitting a regression line to each student\’s test scores obtained from a series of CAT administrations. Next, we\ used the conditional standard error of measurement (cSEM) from the CAT administrations to create an error band around the Theil-Sen slope (i.e., student growth rate). This process resulted in the normative slope values across all the grade levels. The optimal number of CAT administrations was established from grade-level regression results. The amount of time needed for progress monitoring was determined by calculating the amount of time required for a student to show growth beyond the median cSEM value for each grade level. The results showed that the normative slope values were the highest for lower grades and declined steadily as grade level increased. The results also suggested that the CAT-based reading assessment is most useful for grades 1 through 4, since most struggling readers requiring an intervention appear to be within this grade range. Because CAT yielded very similar cSEM values across administrations, the amount of error in the progress monitoring decisions did not seem to depend on the number of CAT administrations.
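
As a rough illustration of the slope-and-error-band logic described above (not the operational code used in the study; the weeks, scores, and cSEM value below are invented), a Theil-Sen growth rate can be computed with scipy and compared against a cSEM-based band:

    # Hedged sketch: fit a Theil-Sen slope to one student's sequence of CAT scores and
    # compare the implied growth against a cSEM-based error band. All values are invented.
    import numpy as np
    from scipy.stats import theilslopes

    weeks = np.array([0, 4, 8, 12, 16, 20])            # timing of each CAT administration
    scores = np.array([480, 492, 488, 505, 511, 520])  # scale scores from successive CATs
    median_csem = 12.0                                 # median conditional SEM for the grade

    slope, intercept, lo_slope, hi_slope = theilslopes(scores, weeks)  # robust growth per week

    # Weeks of monitoring needed before expected growth exceeds one median cSEM
    weeks_needed = median_csem / slope if slope > 0 else float("inf")
    print(f"Theil-Sen growth rate: {slope:.2f} points/week")
    print(f"Weeks needed to exceed one median cSEM: {weeks_needed:.1f}")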

}, keywords = {CAT, Large-Scale tests, Process monitoring}, url = {https://drive.google.com/open?id=1uGbCKenRLnqTxImX1fZicR2c7GRV6Udc}, author = {Okan Bulut and Damien Cormier} } @article {2529, title = {Latent-Class-Based Item Selection for Computerized Adaptive Progress Tests}, journal = {Journal of Computerized Adaptive Testing}, volume = {5}, year = {2017}, pages = {22-43}, keywords = {computerized adaptive progress test, item selection method, Kullback-Leibler information, Latent class analysis, log-odds scoring}, issn = {2165-6592}, doi = {10.7333/1704-0502022}, url = {http://iacat.org/jcat/index.php/jcat/article/view/62/29}, author = {van Buuren, Nikky and Eggen, Theo J. H. M.} } @conference {2648, title = {New Challenges (With Solutions) and Innovative Applications of CAT}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Over the past several decades, computerized adaptive testing (CAT) has profoundly changed the administration of large-scale aptitude tests, state-wide achievement tests, professional licensure exams, and health outcome measures. While many challenges of CAT have been successfully addressed due to the continual efforts of researchers in the field, there are still many remaining, longstanding challenges that have yet to be resolved. This symposium will begin with three presentations, each of which provides a sound solution to one of the unresolved challenges. They are (1) item calibration when responses are \“missing not at random\” from CAT administration; (2) online calibration of new items when person traits have non-ignorable measurement error; (3) establishing consistency and asymptotic normality of latent trait estimation when allowing item response revision in CAT. In addition, this symposium also features innovative applications of CAT. In particular, there is emerging interest in using cognitive diagnostic CAT to monitor and detect learning progress (4th presentation). Last but not least, the 5th presentation illustrates the power of multidimensional polytomous CAT that permits rapid identification of hospitalized patients\’ rehabilitative care needs in\ health outcomes measurement. We believe this symposium covers a wide range of interesting and important topics in CAT.

}, keywords = {CAT, challenges, innovative applications}, url = {https://drive.google.com/open?id=1Wvgxw7in_QCq_F7kzID6zCZuVXWcFDPa}, author = {Chun Wang and David J. Weiss and Xue Zhang and Jian Tao and Yinhong He and Ping Chen and Shiyu Wang and Susu Zhang and Haiyan Lin and Xiaohong Gao and Hua-Hua Chang and Zhuoran Shang} } @conference {2638, title = {A New Cognitive Diagnostic Computerized Adaptive Testing for Simultaneously Diagnosing Skills and Misconceptions}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

In educational diagnosis, identifying misconceptions is as important as diagnosing skills. However, traditional cognitive diagnostic computerized adaptive testing (CD-CAT) is usually developed to diagnose skills only. This study proposes a new CD-CAT that can simultaneously diagnose skills and misconceptions. The proposed CD-CAT is based on a recently published CDM, the simultaneously identifying skills and misconceptions (SISM) model (Kuo, Chen, \& de la Torre, in press). A new item selection algorithm is also proposed to achieve high adaptive testing performance. In simulation studies, we compare the new item selection algorithm with three existing item selection methods: the Kullback\–Leibler (KL) and posterior-weighted KL (PWKL) methods proposed by Cheng (2009) and the modified PWKL (MPWKL) method proposed by Kaplan, de la Torre, and Barrada (2015). The results show that the proposed CD-CAT can efficiently diagnose skills and misconceptions; the accuracy of the new item selection algorithm is close to that of the MPWKL method but with less computational burden; and the new algorithm outperforms the KL and PWKL methods in diagnosing skills and misconceptions.
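
For context, a minimal sketch of generic posterior-weighted KL item selection in the spirit of Cheng (2009) follows; it is not the SISM-specific index proposed in the study, and all item parameters and posteriors below are invented:

    # Hedged sketch of generic posterior-weighted Kullback-Leibler (PWKL) item selection
    # for a CD-CAT, in the spirit of Cheng (2009); not the SISM-specific index.
    import numpy as np

    def bernoulli_kl(p, q):
        # KL divergence between Bernoulli(p) and Bernoulli(q) response distributions
        return p * np.log(p / q) + (1 - p) * np.log((1 - p) / (1 - q))

    def pwkl_select(p_correct, posterior, current_class, administered):
        # p_correct[j, c]: P(correct response to item j | latent class c)
        # posterior[c]:    current posterior probability of latent class c
        # current_class:   index of the provisionally estimated class
        best_item, best_value = None, -np.inf
        for j in range(p_correct.shape[0]):
            if j in administered:
                continue
            p_hat = p_correct[j, current_class]
            value = np.sum(posterior * bernoulli_kl(p_hat, p_correct[j, :]))
            if value > best_value:
                best_item, best_value = j, value
        return best_item

    p_correct = np.array([[0.9, 0.5, 0.2],
                          [0.8, 0.6, 0.3],
                          [0.7, 0.4, 0.1],
                          [0.6, 0.5, 0.4]])   # 4 items x 3 latent classes (invented)
    posterior = np.array([0.2, 0.5, 0.3])
    print(pwkl_select(p_correct, posterior, current_class=1, administered={0}))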

References

Cheng, Y. (2009). When cognitive diagnosis meets computerized adaptive testing: CD-CAT. Psychometrika, 74(4), 619\–632. doi: 10.1007/s11336-009-9123-2

Kaplan, M., de la Torre, J., \& Barrada, J. R. (2015). New item selection methods for cognitive diagnosis computerized adaptive testing. Applied Psychological Measurement, 39(3), 167\–188. doi:10.1177/0146621614554650

Kuo, B.-C., Chen, C.-H., \& de la Torre, J. (in press). A cognitive diagnosis model for identifying coexisting skills and misconceptions. Applied Psychological Measurement.

}, keywords = {CD-CAT, Misconceptions, Simultaneous diagnosis}, author = {Bor-Chen Kuo and Chun-Hua Chen} } @conference {2636, title = {New Results on Bias in Estimates due to Discontinue Rules in Intelligence Testing}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

The presentation provides new results on a form of adaptive testing that is used frequently in intelligence testing. In these tests, items are presented in order of increasing difficulty, and the presentation of items is adaptive in the sense that each subtest session is discontinued once a test taker produces a certain number of incorrect responses in sequence. The subsequent (not observed) responses are commonly scored as wrong for that subtest, even though the test taker has not seen these items. Discontinuation rules allow a certain form of adaptiveness in both paper-based and computer-based testing and help reduce testing time.

Two relevant lines of research are studies that directly assess the impact of discontinuation rules and studies that look more broadly at the impact of scoring rules on test results with a large number of not-administered or not-reached items. He \& Wolfe (2012) compared different ability estimation methods for this type of discontinuation-rule adaptation of test length in a simulation study. However, to our knowledge there has been no rigorous analytical study of the underlying distributional changes of the response variables under discontinuation rules. It is important to point out that the results obtained by He \& Wolfe (2012) agree with results presented by, for example, DeAyala, Plake, \& Impara (2001) as well as Rose, von Davier, \& Xu (2010) and Rose, von Davier, \& Nagengast (2016) in that ability estimates are biased most when scoring the not-observed responses as wrong. Discontinuation rules combined with scoring the non-administered items as wrong are used operationally in several major intelligence tests, so more research is needed in order to improve this particular type of adaptiveness in testing practice.

The presentation extends existing research on adaptiveness through discontinue rules in intelligence tests in multiple ways: First, a rigorous analytical study of the distributional properties of discontinue-rule scored items is presented. Second, an extended simulation is presented that includes additional alternative scoring rules as well as bias-corrected ability estimators that may be suitable for improving results for discontinue-rule scored intelligence tests.
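
A minimal sketch of the discontinuation-and-scoring rule under study (stop after a run of k consecutive incorrect responses and score the remaining, unseen items as wrong) is shown below; the response pattern and the value of k are invented for illustration:

    # Hedged sketch: apply a discontinue rule (stop after k consecutive incorrect responses)
    # and score the remaining, never-administered items as wrong. Pattern and k are invented.
    import numpy as np

    def apply_discontinue_rule(responses, k=3):
        # responses: 0/1 vector in order of increasing item difficulty
        scored = np.zeros_like(responses)
        run = 0
        for i, r in enumerate(responses):
            scored[i] = r
            run = run + 1 if r == 0 else 0
            if run >= k:          # discontinue: remaining items are never shown
                break
        return scored             # unseen items stay scored as 0 ("wrong")

    full = np.array([1, 1, 0, 1, 0, 0, 0, 1, 0, 1])
    print(apply_discontinue_rule(full, k=3))   # -> [1 1 0 1 0 0 0 0 0 0]
    print("raw score, full vs. discontinued:", full.sum(), apply_discontinue_rule(full, k=3).sum())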

References: DeAyala, R. J., Plake, B. S., \& Impara, J. C. (2001). The impact of omitted responses on the accuracy of ability estimation in item response theory. Journal of Educational Measurement, 38, 213-234.

He, W. \& Wolfe, E. W. (2012). Treatment of Not-Administered Items on Individually Administered Intelligence Tests. Educational and Psychological Measurement, Vol 72, Issue 5, pp. 808 \– 826. DOI: 10.1177/0013164412441937

Rose, N., von Davier, M., \& Xu, X. (2010). Modeling non-ignorable missing data with item response theory (IRT; ETS RR-10-11). Princeton, NJ: Educational Testing Service.

Rose, N., von Davier, M., \& Nagengast, B. (2016) Modeling omitted and not-reached items in irt models. Psychometrika. doi:10.1007/s11336-016-9544-7

}, keywords = {Bias, CAT, Intelligence Testing}, author = {Matthias von Davier and Youngmi Cho and Tianshu Pan} } @conference {2670, title = {Response Time and Response Accuracy in Computerized Adaptive Testing}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Introduction. This study explores the relationship between response speed and response accuracy in Computerized Adaptive Testing (CAT). CAT provides a score as well as item response times, which can offer additional diagnostic information about the behavioral processes of task completion that cannot be uncovered by paper-based instruments. The goal of this study is to investigate how the accuracy rate evolves as a function of response time. If the accuracy of cognitive test responses decreases with response time, it indicates that the underlying cognitive process is a degrading process such as knowledge retrieval. More accessible knowledge can be retrieved faster than less accessible knowledge. For instance, in reading tasks, the time-on-task effect is negative, and the more negative it is, the easier the task. However, if the accuracy of cognitive test responses increases with response time, then the process is of an upgrading nature, with an increasing success rate as a function of response time. For example, problem solving takes time, and fast responses are less likely to be well-founded. It is of course also possible that the relationship is curvilinear, as when an increasing success rate is followed by a decreasing success rate or vice versa.

Hypothesis. The present study argues that the relationship between response time on task and response accuracy can be positive, negative, or curvilinear, depending on the cognitive nature of the task items, holding the ability of the subjects and the difficulty of the items constant.

Methodology. Data from a subsection of the GRE quantitative test were available and will be analyzed with generalized linear mixed models. A linear model means that a linear combination of predictors determines the probability that person p answers item i correctly. Modeling mixed effects means that both random effects and fixed effects are included; fixed effects are constants across test takers. The models are equivalent to advanced IRT models that go beyond the regular modeling of test responses in terms of one or more latent variables and item parameters. The lme4 package for R will be used for the statistical computations.
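
To make the model form concrete, the following hedged sketch (written in Python rather than the lme4/R analysis the study describes; all parameter values are invented) shows how a linear combination of a person effect, an item effect, and a time-on-task effect maps to the probability of a correct response through a logistic link:

    # Hedged sketch of the model form: person ability + item easiness + time-on-task effect
    # passed through an inverse-logit link to give P(correct). Values are invented.
    import numpy as np

    def p_correct(log_rt, theta_p, easiness_i, beta_time):
        eta = theta_p + easiness_i + beta_time * log_rt   # linear predictor
        return 1.0 / (1.0 + np.exp(-eta))                 # inverse-logit link

    log_rt = np.linspace(-1.0, 2.0, 7)                    # standardized log response times
    # beta_time < 0: a "degrading" process (accuracy falls with time on task);
    # beta_time > 0: an "upgrading" process (accuracy rises with time on task).
    for beta_time in (-0.8, 0.8):
        probs = p_correct(log_rt, theta_p=0.3, easiness_i=0.5, beta_time=beta_time)
        print(beta_time, np.round(probs, 2))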

Research questions. 1. What is the relationship between response accuracy and response speed? 2. What is the correlation between response accuracy and type of response time (fast vs. slow responses) after controlling for examinee ability?

Preliminary Findings. 1. There is a negative relationship between response time and response accuracy: the success rate declines with elapsing response time. 2. The correlation between the two response-time latent variables (fast and slow) is 1.0, indicating that the time-on-task effects do not differ between response-time types.

Implications. The right amount of testing time in CAT is important: too much is wasteful and costly, too little impacts score validity. The study is expected to provide new insight into the relationship between response time and response accuracy, which, in turn, will contribute to the best timing strategy in CAT, with or without time constraints.

}, keywords = {CAT, response accuracy, Response time}, url = {https://drive.google.com/open?id=1yYP01bzGrKvJnfLwepcAoQQ2F4TdSvZ2}, author = {Yang Shi} } @conference {2630, title = {Scripted On-the-fly Multistage Testing}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

On-the-fly multistage testing (OMST) was introduced recently as a promising alternative to preassembled MST. A decidedly appealing feature of both is the reviewability of items within the current stage. However, the fundamental difference is that, instead of routing to a preassembled module, OMST adaptively assembles a module at each stage according to an interim ability estimate. This produces more individualized forms with finer measurement precision, but imposing nonstatistical constraints and controlling item exposure become more cumbersome. One recommendation is to use the maximum priority index followed by a remediation step to satisfy content constraints, and the Sympson-Hetter method with a stratified item bank for exposure control.

However, these methods can be computationally expensive, thereby impeding practical implementation. Therefore, this study investigated the script method as a simpler solution to the challenge of strict content balancing and effective item exposure control in OMST. The script method was originally devised as an item selection algorithm for CAT and generally proceeds as follows: For a test with m items, there are m slots to be filled, and an item is selected according to pre-defined rules for each slot. For the first slot, randomly select an item from a designated content area (collection). For each subsequent slot, 1) Discard any enemies of items already administered in previous slots; 2) Draw a designated number of candidate items (selection length) from the designated collection according to the current ability estimate; 3) Randomly select one item from the set of candidates. There are two distinct features of the script method. First, a predetermined sequence of collections guarantees meeting content specifications. The specific ordering may be determined either randomly or deliberately by content experts. Second, steps 2 and 3 depict a method of exposure control, in which selection length balances item usage at the possible expense of ability estimation accuracy. The adaptation of the script method to OMST is straightforward. For the first module, randomly select each item from a designated collection. For each subsequent module, the process is the same as in scripted CAT (SCAT) except the same ability estimate is used for the selection of all items within the module. A series of simulations was conducted to evaluate the performance of scripted OMST (SOMST, with 3 or 4 evenly divided stages) relative to SCAT under various item exposure restrictions. In all conditions, reliability was maximized by programming an optimization algorithm that searches for the smallest possible selection length for each slot within the constraints. Preliminary results indicated that SOMST is certainly a capable design with performance comparable to that of SCAT. The encouraging findings and ease of implementation highly motivate the prospect of operational use for large-scale assessments.
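
A minimal sketch of the per-slot logic of the script method as outlined above follows; the item bank, the enemy list, and the use of difficulty-to-theta proximity as a stand-in for "according to the current ability estimate" are simplifying assumptions for illustration:

    # Hedged sketch of one slot of the script method: discard enemies of administered items,
    # draw `selection_length` candidates from the designated collection, pick one at random.
    import random

    def select_for_slot(bank, collection, theta, administered, enemies, selection_length):
        # Step 1: discard enemies of items already administered in previous slots.
        blocked = set()
        for item_id in administered:
            blocked |= enemies.get(item_id, set())
        eligible = [item for item in bank
                    if item["collection"] == collection
                    and item["id"] not in administered
                    and item["id"] not in blocked]
        # Step 2: draw candidates according to the current ability estimate (closest difficulty).
        eligible.sort(key=lambda item: abs(item["b"] - theta))
        candidates = eligible[:selection_length]
        # Step 3: randomly select one item from the candidate set.
        return random.choice(candidates)["id"]

    # Toy bank: odd ids belong to the "algebra" collection, even ids to "geometry".
    bank = [{"id": k, "collection": "algebra" if k % 2 else "geometry", "b": (k - 5) / 2.0}
            for k in range(10)]
    enemies = {3: {5}}
    print(select_for_slot(bank, "algebra", theta=0.2, administered={3}, enemies=enemies,
                          selection_length=2))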

}, keywords = {CAT, multistage testing, On-the-fly testing}, url = {https://drive.google.com/open?id=1wKuAstITLXo6BM4APf2mPsth1BymNl-y}, author = {Edison Choe and Bruce Williams and Sung-Hyuck Lee} } @conference {2633, title = {Using Bayesian Decision Theory in Cognitive Diagnosis Computerized Adaptive Testing}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata Japan}, abstract = {

Cognitive diagnosis computerized adaptive testing (CD-CAT) purports to provide each individual with a profile of strengths and weaknesses on attributes or skills using computerized adaptive testing. In the CD-CAT literature, researchers have dedicated themselves to developing item selection algorithms that improve measurement efficiency, and most algorithms have been developed based on information theory. Given the discontinuous nature of the latent variables in CD-CAT, this study introduces an alternative for item selection, called the minimum expected cost (MEC) method, which is derived from Bayesian decision theory. Using simulations, the MEC method was evaluated against the posterior-weighted Kullback-Leibler (PWKL) information, the modified PWKL (MPWKL), and the mutual information (MI) methods by manipulating item bank quality, item selection algorithm, and termination rule. Results indicated that, regardless of item quality and termination criterion, the MEC, MPWKL, and MI methods performed very similarly, and all outperformed the PWKL method in classification accuracy and test efficiency, especially in short tests; the MEC method also had more efficient item bank usage than the MPWKL and MI methods. Moreover, the MEC method can consider the costs of incorrect decisions and improve classification accuracy and test efficiency when a particular profile is of concern. All the results suggest the practicability of the MEC method in CD-CAT.
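
For illustration, a generic minimum-expected-cost selection step based on Bayesian decision theory is sketched below; it is a simplified stand-in, not the exact MEC method evaluated in the study, and all probabilities and costs are invented:

    # Hedged sketch: for each candidate item, average (over the predictive response
    # distribution) the cost of the best classification decision; pick the lowest.
    import numpy as np

    def expected_cost(p_correct_j, posterior, cost):
        # p_correct_j[c]: P(correct | latent class c) for one candidate item
        # posterior[c]:   current posterior over latent classes
        # cost[t, d]:     loss for deciding class d when class t is true
        total = 0.0
        for x in (0, 1):                              # possible item responses
            like = p_correct_j if x == 1 else 1.0 - p_correct_j
            joint = posterior * like
            p_x = joint.sum()                         # predictive probability of response x
            if p_x == 0:
                continue
            post_x = joint / p_x                      # posterior after observing x
            total += p_x * (post_x @ cost).min()      # cost of the best decision given x
        return total

    p_correct = np.array([[0.9, 0.4], [0.7, 0.6], [0.8, 0.2]])   # 3 items x 2 classes (invented)
    posterior = np.array([0.5, 0.5])
    cost = np.array([[0.0, 1.0], [1.0, 0.0]])                    # zero-one misclassification cost
    costs = [expected_cost(p_correct[j], posterior, cost) for j in range(3)]
    print("expected costs:", np.round(costs, 3), "-> select item", int(np.argmin(costs)))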

}, keywords = {Bayesian Decision Theory, CD-CAT}, author = {Chia-Ling Hsu and Wen-Chung Wang and ShuYing Chen} } @conference {2661, title = {Using Computerized Adaptive Testing to Detect Students{\textquoteright} Misconceptions: Exploration of Item Selection}, booktitle = {IACAT 2017 Conference}, year = {2017}, month = {08/2017}, publisher = {Niigata Seiryo University}, organization = {Niigata Seiryo University}, address = {Niigata, Japan}, abstract = {

Holding misconceptions impedes learning; thus, detecting misconceptions through assessment is crucial for facilitating teaching. However, most computerized adaptive testing (CAT) applications that diagnose examinees\’ attribute profiles focus only on whether examinees have mastered correct concepts. In educational settings, teachers and students have to figure out the misconceptions underlying incorrect answers after obtaining the scores from assessments and then correct those misconceptions. The Scaling Individuals and Classifying Misconceptions (SICM) models proposed by Bradshaw and Templin (2014) fill this gap. SICM models can identify a student\’s misconceptions directly from the distractors of multiple-choice questions and report whether the student holds each misconception or not. At the same time, SICM models are able to estimate a continuous ability within the item response theory (IRT) framework to fulfill the needs of policy-driven assessment systems that rely on scaling examinees\’ ability. However, the advantage of providing estimates for two types of latent variables also increases the complexity of model estimation: more items are required to achieve the same accuracy for classification and estimation compared to dichotomous DCMs and to IRT, respectively. Thus, we aim to develop a CAT using the SICM models (SICM-CAT) to estimate students\’ misconceptions and continuous abilities simultaneously using fewer items than a linear test.

To achieve this goal, our research questions focus on establishing several item selection rules for the SICM-CAT that target both accurate classification results and accurate continuous ability estimates. The first research question is which information criterion to use. The Kullback\–Leibler (KL) divergence is the first choice, as it can naturally combine the continuous and discrete latent variables. Based on this criterion, we propose an item selection index that integrates the two types of information. With this index, the items selected in real time can discriminate the examinee\’s current misconception profile and ability estimates from other possible estimates to the greatest extent. The second research question is how to adaptively balance the estimation of the misconception profile and the continuous latent ability. Mimicking the idea of the Hybrid Design proposed by Wang et al. (2016), we propose a design framework in which item selection transitions from the group level to the item level. We aim to explore several design questions, such as how to select the transition point and which latent variable estimation should be targeted first.

Preliminary results indicated that the SICM-CAT based on the proposed item selection index could classify examinees into latent classes and measure their latent abilities more accurately and reliably than the random selection method under all simulation conditions. As a next step, we plan to compare different CAT designs based on the proposed item selection rules with the best linear test. We expect that the SICM-CAT will be able to use a shorter test length while retaining the same accuracy and reliability.

References

Bradshaw, L., \& Templin, J. (2014). Combining item response theory and diagnostic classification models: A psychometric model for scaling ability and diagnosing misconceptions. Psychometrika, 79(3), 403-425.

Wang, S., Lin, H., Chang, H. H., \& Douglas, J. (2016). Hybrid computerized adaptive testing: from group sequential design to fully sequential design. Journal of Educational Measurement, 53(1), 45-62.

}, keywords = {CAT, incorrect answering, Student Misconception}, author = {Yawei Shen and Yu Bao and Shiyu Wang and Laine Bradshaw} } @article {2455, title = {Implementing a CAT: The AMC Experience }, journal = {Journal of Computerized Adaptive Testing}, volume = {3}, year = {2015}, pages = {1-12}, type = {Applications and Implementations}, keywords = {adaptive, Assessment, computer, medical, online, Testing}, issn = {2165-6592}, doi = {10.7333/15100301001}, url = {http://www.iacat.org/jcat/index.php/jcat/article/view/52/25}, author = {Barnard, John J} } @article {2345, title = {Detecting Item Preknowledge in Computerized Adaptive Testing Using Information Theory and Combinatorial Optimization}, journal = {Journal of Computerized Adaptive Testing}, volume = {2}, year = {2014}, pages = {37-58}, keywords = {combinatorial optimization, hypothesis testing, item preknowledge, Kullback-Leibler divergence, simulated annealing., test security}, issn = {2165-6592}, doi = {10.7333/1410-0203037}, url = {http://www.iacat.org/jcat/index.php/jcat/article/view/36/18}, author = {Belov, D. I.} } @article {2292, title = {The Philosophical Aspects of IRT Equating: Modeling Drift to Evaluate Cohort Growth in Large-Scale Assessments}, journal = {Educational Measurement: Issues and Practice}, volume = {32}, number = {1}, year = {2013}, pages = {2{\textendash}14}, keywords = {cohort growth, construct-relevant drift, evaluation of scale drift, philosophical aspects of IRT equating}, issn = {1745-3992}, doi = {10.1111/emip.12000}, url = {http://dx.doi.org/10.1111/emip.12000}, author = {Taherbhai, Husein and Seo, Daeryong} } @conference {2080, title = {Adaptive Item Calibration and Norming: Unique Considerations of a Global Deployment}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, month = {10/2011}, keywords = {CAT, common item equating, Figural Reasoning Test, item calibration, norming}, author = {Alexander Schwall and Evan Sinar} } @conference {2077, title = {Building Affordable CD-CAT Systems for Schools To Address Today{\textquoteright}s Challenges In Assessment}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, keywords = {affordability, CAT, cost}, author = {Chang, Hua-Hua} } @article {2070, title = {catR: An R Package for Computerized Adaptive Testing}, journal = {Applied Psychological Measurement}, year = {2011}, abstract = {

Computerized adaptive testing (CAT) is an active current research field in psychometrics and educational measurement. However, there is very little software available to handle such adaptive tasks. The R package catR was developed to perform adaptive testing with as much flexibility as possible, in an attempt to provide a developmental and testing platform to the interested user. Several item-selection rules and ability estimators are implemented. The item bank can be provided by the user or randomly generated from parent distributions of item parameters. Three stopping rules are available. The output can be graphically displayed.

}, keywords = {computer program, computerized adaptive testing, Estimation, Item Response Theory}, doi = {10.1177/0146621611407482}, author = {Magis, D. and Ra{\^\i}che, G.} } @conference {2100, title = {Continuous Testing (an avenue for CAT research)}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, month = {10/2011}, abstract = {

Publishing an Adaptive Test

Problems with Publishing

Research Questions

}, keywords = {CAT, item filter, item filtration}, author = {G. Gage Kingsbury} } @conference {2083, title = {Detecting DIF between Conventional and Computerized Adaptive Testing: A Monte Carlo Study}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, month = {10/2011}, abstract = {

Two procedures, the Modified Robust Z statistic and the 95\% Credible Interval, were compared in a Monte Carlo study. Both procedures evidenced adequate control of false-positive DIF results.

}, keywords = {95\% Credible Interval, CAT, DIF, differential item function, modified robust Z statistic, Monte Carlo methodologies}, author = {Barth B. Riley and Adam C. Carle} } @conference {2099, title = {From Reliability to Validity: Expanding Adaptive Testing Practice to Find the Most Valid Score for Each Test Taker}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, month = {10/2011}, abstract = {

CAT is an exception to the traditional conception of validity. It is one of the few examples of individualized testing. Item difficulty is tailored to each examinee. The intent, however, is increased efficiency. Focus on reliability (reduced standard error); Equivalence with paper \& pencil tests is valued; Validity is enhanced through improved reliability.

How Else Might We Individualize Testing Using CAT?

An ISV-Based View of Validity

Test Event -- An examinee encounters a series of items in a particular context.

CAT Goal: individualize testing to address CIV threats to score validity (i.e., maximize ISV).

Some Research Issues:

}, keywords = {CAT, CIV, construct-irrelevant variance, Individual Score Validity, ISV, low test taking motivation, Reliability, validity}, author = {Steven L. Wise} } @conference {2079, title = {A Heuristic Of CAT Item Selection Procedure For Testlets}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, keywords = {CAT, shadow test, testlets}, author = {Yuehmei Chien and David Shin and Walter Denny Way} } @conference {2078, title = {High-throughput Health Status Measurement using CAT in the Era of Personal Genomics: Opportunities and Challenges}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, keywords = {CAT, health applications, PROMIS}, author = {Eswar Krishnan} } @conference {2106, title = {Item Selection Methods based on Multiple Objective Approaches for Classification of Respondents into Multiple Levels}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, month = {10/2011}, abstract = {

Is it possible to develop new item selection methods which take advantage of the fact that we want to classify into multiple categories? New methods: Taking multiple points on the ability scale into account; Based on multiple objective approaches.

Conclusions

}, keywords = {adaptive classification test, CAT, item selection, sequential classification test}, author = {Maaike van Groen and Theo Eggen and Bernard Veldkamp} } @conference {2090, title = {Moving beyond Efficiency to Allow CAT to Provide Better Diagnostic Information}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, month = {10/2011}, abstract = {
Future CATs will provide better diagnostic information to
\–Examinees
\–Regulators, Educators, Employers
\–Test Developers
This goal will be accomplished by
\–Smart CATs which collect additional information during the test
\–Psychomagic
The time is now for Reporting
}, keywords = {CAT, dianostic information, MIRT, Multiple unidimensional scales, psychomagic, smart CAT}, author = {Brian D. Bontempo} } @conference {2108, title = {Optimal Calibration Designs for Computerized Adaptive Testing}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, month = {10/2011}, abstract = {

Optimization

How can we exploit the advantages of Balanced Block Design while keeping the logistics manageable?

Homogeneous Designs: Overlap between test booklets as regular as possible

Conclusions:

}, keywords = {balanced block design, CAT, item calibration, optimization, Rasch}, author = {Angela Verschoor} } @conference {2098, title = {A Paradigm for Multinational Adaptive Testing}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, month = {10/2011}, abstract = {

Impact of Issues in \“Exported\” Adaptive Testing

Goal is construct equivalency in the new environment

Research Questions

}, keywords = {CAT, multinational adaptive testing}, author = {A Zara} } @conference {2081, title = {Practitioner{\textquoteright}s Approach to Identify Item Drift in CAT}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, month = {10/2011}, keywords = {CUSUM method, G2 statistic, IPA, item drift, item parameter drift, Lord{\textquoteright}s chi-square statistic, Raju{\textquoteright}s NCDIF}, author = {Huijuan Meng and Susan Steinkamp and Paul Jones and Joy Matthews-Lopez} } @conference {2076, title = {Small-Sample Shadow Testing}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, keywords = {CAT, shadow test}, author = {Wallace Judd} } @conference {2105, title = {A Test Assembly Model for MST}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, month = {10/2011}, abstract = {

This study is just a short exploration of the optimization of an MST. It is extremely hard, or maybe impossible, to chart the influence of the item pool and test specifications on the optimization process. Simulations are very helpful in finding an acceptable MST.

}, keywords = {CAT, mst, multistage testing, Rasch, routing, tif}, author = {Angela Verschoor and Ingrid Radtke and Theo Eggen} } @conference {2107, title = {The Use of Decision Trees for Adaptive Item Selection and Score Estimation}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, month = {10/2011}, abstract = {

Conducted post-hoc simulations comparing the relative efficiency and precision of decision trees (using CHAID and CART) vs. IRT-based CAT.

Conclusions

Decision tree methods were more efficient than CAT

But,...

Conclusions

CAT selects items based on two criteria: Item location relative to current estimate of theta, Item discrimination

Decision Trees select items that best discriminate between groups defined by the total score.

CAT is optimal only when trait level is well estimated.
Findings suggest that combining decision tree followed by CAT item selection may be advantageous.

}, keywords = {adaptive item selection, CAT, decision tree}, author = {Barth B. Riley and Rodney Funk and Michael L. Dennis and Richard D. Lennox and Matthew Finkelman} } @conference {2101, title = {Walking the Tightrope: Using Better Content Control to Improve CAT}, booktitle = {Annual Conference of the International Association for Computerized Adaptive Testing}, year = {2011}, month = {10/2011}, abstract = {

All testing involves a balance between measurement precision and content considerations. CAT item-selection algorithms have evolved to accommodate content considerations. Reviews CAT evolution including: Original/\”Pure\” adaptive exams, Constrained CAT, Weighted-deviations method, Shadow-Test Approach, Testlets instead of fully adapted tests, Administration of one item may preclude the administration of other item(s), and item relationships.

Research Questions

}, keywords = {CAT, CAT evolution, test content}, author = {Kathleen A. Gialluca} } @article {415, title = {Bayesian item selection in constrained adaptive testing}, journal = {Psicologica}, volume = {31}, number = {1}, year = {2010}, pages = {149-169}, abstract = {Application of Bayesian item selection criteria in computerized adaptive testing might result in improvement of bias and MSE of the ability estimates. The question remains how to apply Bayesian item selection criteria in the context of constrained adaptive testing, where large numbers of specifications have to be taken into account in the item selection process. The Shadow Test Approach is a general purpose algorithm for administering constrained CAT. In this paper it is shown how the approach can be slightly modified to handle Bayesian item selection criteria. No differences in performance were found between the shadow test approach and the modifiedapproach. In a simulation study of the LSAT, the effects of Bayesian item selection criteria are illustrated. The results are compared to item selection based on Fisher Information. General recommendations about the use of Bayesian item selection criteria are provided.}, keywords = {computerized adaptive testing}, author = {Veldkamp, B. P.} } @article {113, title = {Detection of aberrant item score patterns in computerized adaptive testing: An empirical example using the CUSUM}, journal = {Personality and Individual Differences}, volume = {48}, number = {8}, year = {2010}, pages = {921-925}, abstract = {The scalability of individual trait scores on a computerized adaptive test (CAT) was assessed through investigating the consistency of individual item score patterns. A sample of N = 428 persons completed a personality CAT as part of a career development procedure. To detect inconsistent item score patterns, we used a cumulative sum (CUSUM) procedure. Combined information from the CUSUM, other personality measures, and interviews showed that similar estimated trait values may have a different interpretation.Implications for computer-based assessment are discussed.}, keywords = {CAT, computerized adaptive testing, CUSUM approach, person Fit}, isbn = {01918869}, author = {Egberink, I. J. L. and Meijer, R. R. and Veldkamp, B. P. and Schakel, L. and Smid, N. G.} } @article {46, title = {Development and validation of patient-reported outcome measures for sleep disturbance and sleep-related impairments}, journal = {Sleep}, volume = {33}, number = {6}, year = {2010}, note = {Buysse, Daniel JYu, LanMoul, Douglas EGermain, AnneStover, AngelaDodds, Nathan EJohnston, Kelly LShablesky-Cade, Melissa APilkonis, Paul AAR052155/AR/NIAMS NIH HHS/United StatesU01AR52155/AR/NIAMS NIH HHS/United StatesU01AR52158/AR/NIAMS NIH HHS/United StatesU01AR52170/AR/NIAMS NIH HHS/United StatesU01AR52171/AR/NIAMS NIH HHS/United StatesU01AR52177/AR/NIAMS NIH HHS/United StatesU01AR52181/AR/NIAMS NIH HHS/United StatesU01AR52186/AR/NIAMS NIH HHS/United StatesResearch Support, N.I.H., ExtramuralValidation StudiesUnited StatesSleepSleep. 2010 Jun 1;33(6):781-92.}, month = {Jun 1}, pages = {781-92}, edition = {2010/06/17}, abstract = {STUDY OBJECTIVES: To develop an archive of self-report questions assessing sleep disturbance and sleep-related impairments (SRI), to develop item banks from this archive, and to validate and calibrate the item banks using classic validation techniques and item response theory analyses in a sample of clinical and community participants. DESIGN: Cross-sectional self-report study. 
SETTING: Academic medical center and participant homes. PARTICIPANTS: One thousand nine hundred ninety-three adults recruited from an Internet polling sample and 259 adults recruited from medical, psychiatric, and sleep clinics. INTERVENTIONS: None. MEASUREMENTS AND RESULTS: This study was part of PROMIS (Patient-Reported Outcomes Information System), a National Institutes of Health Roadmap initiative. Self-report item banks were developed through an iterative process of literature searches, collecting and sorting items, expert content review, qualitative patient research, and pilot testing. Internal consistency, convergent validity, and exploratory and confirmatory factor analysis were examined in the resulting item banks. Factor analyses identified 2 preliminary item banks, sleep disturbance and SRI. Item response theory analyses and expert content review narrowed the item banks to 27 and 16 items, respectively. Validity of the item banks was supported by moderate to high correlations with existing scales and by significant differences in sleep disturbance and SRI scores between participants with and without sleep disorders. CONCLUSIONS: The PROMIS sleep disturbance and SRI item banks have excellent measurement properties and may prove to be useful for assessing general aspects of sleep and SRI with various groups of patients and interventions.}, keywords = {*Outcome Assessment (Health Care), *Self Disclosure, Adult, Aged, Aged, 80 and over, Cross-Sectional Studies, Factor Analysis, Statistical, Female, Humans, Male, Middle Aged, Psychometrics, Questionnaires, Reproducibility of Results, Sleep Disorders/*diagnosis, Young Adult}, isbn = {0161-8105 (Print)0161-8105 (Linking)}, author = {Buysse, D. J. and Yu, L. and Moul, D. E. and Germain, A. and Stover, A. and Dodds, N. E. and Johnston, K. L. and Shablesky-Cade, M. A. and Pilkonis, P. A.} } @article {2071, title = {Item Selection and Hypothesis Testing for the Adaptive Measurement of Change}, journal = {Applied Psychological Measurement}, volume = {34}, year = {2010}, pages = {238-254}, abstract = {

Assessing individual change is an important topic in both psychological and educational measurement. An adaptive measurement of change (AMC) method had previously been shown to exhibit greater efficiency in detecting change than conventional nonadaptive methods. However, little work had been done to compare different procedures within the AMC framework. This study introduced a new item selection criterion and two new test statistics for detecting change with AMC that were specifically designed for the paradigm of hypothesis testing. In two simulation sets, the new methods for detecting significant change improved on existing procedures by demonstrating better adherence to Type I error rates and substantially better power for detecting relatively small change.\ 
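
As background for the hypothesis-testing framing, a conventional baseline statistic for detecting change between two CAT occasions (not one of the new statistics introduced in this article) compares the two ability estimates relative to their standard errors; the values below are invented:

    # Hedged sketch: a conventional z-type statistic for change between two CAT occasions,
    # used here only as a baseline illustration.
    import math

    def change_z(theta1, se1, theta2, se2):
        # Standardized difference between occasion-2 and occasion-1 estimates
        return (theta2 - theta1) / math.sqrt(se1 ** 2 + se2 ** 2)

    z = change_z(theta1=-0.20, se1=0.30, theta2=0.45, se2=0.28)
    print(round(z, 2), "significant at alpha = .05 (two-sided):", abs(z) > 1.96)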

}, keywords = {change, computerized adaptive testing, individual change, Kullback{\textendash}Leibler information, likelihood ratio, measuring change}, doi = {10.1177/0146621609344844}, author = {Finkelman, M. D. and Weiss, D. J. and Kim-Kang, G.} } @article {7, title = {Development and preliminary testing of a computerized adaptive assessment of chronic pain}, journal = {Journal of Pain}, volume = {10}, number = {9}, year = {2009}, note = {Anatchkova, Milena DSaris-Baglama, Renee NKosinski, MarkBjorner, Jakob B1R43AR052251-01A1/AR/NIAMS NIH HHS/United StatesEvaluation StudiesResearch Support, N.I.H., ExtramuralUnited StatesThe journal of pain : official journal of the American Pain SocietyJ Pain. 2009 Sep;10(9):932-43.}, month = {Sep}, pages = {932-943}, edition = {2009/07/15}, abstract = {The aim of this article is to report the development and preliminary testing of a prototype computerized adaptive test of chronic pain (CHRONIC PAIN-CAT) conducted in 2 stages: (1) evaluation of various item selection and stopping rules through real data-simulated administrations of CHRONIC PAIN-CAT; (2) a feasibility study of the actual prototype CHRONIC PAIN-CAT assessment system conducted in a pilot sample. Item calibrations developed from a US general population sample (N = 782) were used to program a pain severity and impact item bank (kappa = 45), and real data simulations were conducted to determine a CAT stopping rule. The CHRONIC PAIN-CAT was programmed on a tablet PC using QualityMetric{\textquoteright}s Dynamic Health Assessment (DYHNA) software and administered to a clinical sample of pain sufferers (n = 100). The CAT was completed in significantly less time than the static (full item bank) assessment (P < .001). On average, 5.6 items were dynamically administered by CAT to achieve a precise score. Scores estimated from the 2 assessments were highly correlated (r = .89), and both assessments discriminated across pain severity levels (P < .001, RV = .95). Patients{\textquoteright} evaluations of the CHRONIC PAIN-CAT were favorable. PERSPECTIVE: This report demonstrates that the CHRONIC PAIN-CAT is feasible for administration in a clinic. The application has the potential to improve pain assessment and help clinicians manage chronic pain.}, keywords = {*Computers, *Questionnaires, Activities of Daily Living, Adaptation, Psychological, Chronic Disease, Cohort Studies, Disability Evaluation, Female, Humans, Male, Middle Aged, Models, Psychological, Outcome Assessment (Health Care), Pain Measurement/*methods, Pain, Intractable/*diagnosis/psychology, Psychometrics, Quality of Life, User-Computer Interface}, isbn = {1528-8447 (Electronic)1526-5900 (Linking)}, author = {Anatchkova, M. D. and Saris-Baglama, R. N. and Kosinski, M. and Bjorner, J. B.} } @article {227, title = {An evaluation of patient-reported outcomes found computerized adaptive testing was efficient in assessing stress perception}, journal = {Journal of Clinical Epidemiology}, volume = {62}, number = {3}, year = {2009}, note = {Kocalevent, Ruya-DanielaRose, MatthiasBecker, JanineWalter, Otto BFliege, HerbertBjorner, Jakob BKleiber, DieterKlapp, Burghard FEvaluation StudiesUnited StatesJournal of clinical epidemiologyJ Clin Epidemiol. 2009 Mar;62(3):278-87, 287.e1-3. 
Epub 2008 Jul 18.}, pages = {278-287}, edition = {2008/07/22}, abstract = {OBJECTIVES: This study aimed to develop and evaluate a first computerized adaptive test (CAT) for the measurement of stress perception (Stress-CAT), in terms of the two dimensions: exposure to stress and stress reaction. STUDY DESIGN AND SETTING: Item response theory modeling was performed using a two-parameter model (Generalized Partial Credit Model). The evaluation of the Stress-CAT comprised a simulation study and real clinical application. A total of 1,092 psychosomatic patients (N1) were studied. Two hundred simulees (N2) were generated for a simulated response data set. Then the Stress-CAT was given to n=116 inpatients, (N3) together with established stress questionnaires as validity criteria. RESULTS: The final banks included n=38 stress exposure items and n=31 stress reaction items. In the first simulation study, CAT scores could be estimated with a high measurement precision (SE<0.32; rho>0.90) using 7.0+/-2.3 (M+/-SD) stress reaction items and 11.6+/-1.7 stress exposure items. The second simulation study reanalyzed real patients data (N1) and showed an average use of items of 5.6+/-2.1 for the dimension stress reaction and 10.0+/-4.9 for the dimension stress exposure. Convergent validity showed significantly high correlations. CONCLUSIONS: The Stress-CAT is short and precise, potentially lowering the response burden of patients in clinical decision making.}, keywords = {*Diagnosis, Computer-Assisted, Adolescent, Adult, Aged, Aged, 80 and over, Confidence Intervals, Female, Humans, Male, Middle Aged, Perception, Quality of Health Care/*standards, Questionnaires, Reproducibility of Results, Sickness Impact Profile, Stress, Psychological/*diagnosis/psychology, Treatment Outcome}, isbn = {1878-5921 (Electronic)0895-4356 (Linking)}, author = {Kocalevent, R. D. and Rose, M. and Becker, J. and Walter, O. B. and Fliege, H. and Bjorner, J. B. and Kleiber, D. and Klapp, B. F.} } @article {170, title = {Measuring global physical health in children with cerebral palsy: Illustration of a multidimensional bi-factor model and computerized adaptive testing}, journal = {Quality of Life Research}, volume = {18}, number = {3}, year = {2009}, note = {Haley, Stephen MNi, PengshengDumas, Helene MFragala-Pinkham, Maria AHambleton, Ronald KMontpetit, KathleenBilodeau, NathalieGorton, George EWatson, KyleTucker, Carole AK02 HD045354-01A1/HD/NICHD NIH HHS/United StatesK02 HD45354-01A1/HD/NICHD NIH HHS/United StatesResearch Support, N.I.H., ExtramuralResearch Support, Non-U.S. Gov{\textquoteright}tNetherlandsQuality of life research : an international journal of quality of life aspects of treatment, care and rehabilitationQual Life Res. 2009 Apr;18(3):359-70. Epub 2009 Feb 17.}, month = {Apr}, pages = {359-370}, edition = {2009/02/18}, abstract = {PURPOSE: The purposes of this study were to apply a bi-factor model for the determination of test dimensionality and a multidimensional CAT using computer simulations of real data for the assessment of a new global physical health measure for children with cerebral palsy (CP). METHODS: Parent respondents of 306 children with cerebral palsy were recruited from four pediatric rehabilitation hospitals and outpatient clinics. We compared confirmatory factor analysis results across four models: (1) one-factor unidimensional; (2) two-factor multidimensional (MIRT); (3) bi-factor MIRT with fixed slopes; and (4) bi-factor MIRT with varied slopes. 
We tested whether the general and content (fatigue and pain) person score estimates could discriminate across severity and types of CP, and whether score estimates from a simulated CAT were similar to estimates based on the total item bank, and whether they correlated as expected with external measures. RESULTS: Confirmatory factor analysis suggested separate pain and fatigue sub-factors; all 37 items were retained in the analyses. From the bi-factor MIRT model with fixed slopes, the full item bank scores discriminated across levels of severity and types of CP, and compared favorably to external instruments. CAT scores based on 10- and 15-item versions accurately captured the global physical health scores. CONCLUSIONS: The bi-factor MIRT CAT application, especially the 10- and 15-item versions, yielded accurate global physical health scores that discriminated across known severity groups and types of CP, and correlated as expected with concurrent measures. The CATs have potential for collecting complex data on the physical health of children with CP in an efficient manner.}, keywords = {*Computer Simulation, *Health Status, *Models, Statistical, Adaptation, Psychological, Adolescent, Cerebral Palsy/*physiopathology, Child, Child, Preschool, Factor Analysis, Statistical, Female, Humans, Male, Massachusetts, Pennsylvania, Questionnaires, Young Adult}, isbn = {0962-9343 (Print)0962-9343 (Linking)}, author = {Haley, S. M. and Ni, P. and Dumas, H. M. and Fragala-Pinkham, M. A. and Hambleton, R. K. and Montpetit, K. and Bilodeau, N. and Gorton, G. E. and Watson, K. and Tucker, C. A.} } @article {173, title = {Replenishing a computerized adaptive test of patient-reported daily activity functioning}, journal = {Quality of Life Research}, volume = {18}, number = {4}, year = {2009}, note = {Haley, Stephen MNi, PengshengJette, Alan MTao, WeiMoed, RichardMeyers, DougLudlow, Larry HK02 HD45354-01/HD/NICHD NIH HHS/United StatesResearch Support, N.I.H., ExtramuralNetherlandsQuality of life research : an international journal of quality of life aspects of treatment, care and rehabilitationQual Life Res. 2009 May;18(4):461-71. Epub 2009 Mar 14.}, month = {May}, pages = {461-71}, edition = {2009/03/17}, abstract = {PURPOSE: Computerized adaptive testing (CAT) item banks may need to be updated, but before new items can be added, they must be linked to the previous CAT. The purpose of this study was to evaluate 41 pretest items prior to including them into an operational CAT. METHODS: We recruited 6,882 patients with spine, lower extremity, upper extremity, and nonorthopedic impairments who received outpatient rehabilitation in one of 147 clinics across 13 states of the USA. Forty-one new Daily Activity (DA) items were administered along with the Activity Measure for Post-Acute Care Daily Activity CAT (DA-CAT-1) in five separate waves. We compared the scoring consistency with the full item bank, test information function (TIF), person standard errors (SEs), and content range of the DA-CAT-1 to the new CAT (DA-CAT-2) with the pretest items by real data simulations. RESULTS: We retained 29 of the 41 pretest items. Scores from the DA-CAT-2 were more consistent (ICC = 0.90 versus 0.96) than DA-CAT-1 when compared with the full item bank. TIF and person SEs were improved for persons with higher levels of DA functioning, and ceiling effects were reduced from 16.1\% to 6.1\%. 
CONCLUSIONS: Item response theory and online calibration methods were valuable in improving the DA-CAT.}, keywords = {*Activities of Daily Living, *Disability Evaluation, *Questionnaires, *User-Computer Interface, Adult, Aged, Cohort Studies, Computer-Assisted Instruction, Female, Humans, Male, Middle Aged, Outcome Assessment (Health Care)/*methods}, isbn = {0962-9343 (Print)0962-9343 (Linking)}, author = {Haley, S. M. and Ni, P. and Jette, A. M. and Tao, W. and Moed, R. and Meyers, D. and Ludlow, L. H.} } @article {88, title = {Assessing self-care and social function using a computer adaptive testing version of the pediatric evaluation of disability inventory}, journal = {Archives of Physical Medicine and Rehabilitation}, volume = {89}, number = {4}, year = {2008}, note = {Coster, Wendy JHaley, Stephen MNi, PengshengDumas, Helene MFragala-Pinkham, Maria AK02 HD45354-01A1/HD/NICHD NIH HHS/United StatesR41 HD052318-01A1/HD/NICHD NIH HHS/United StatesR43 HD42388-01/HD/NICHD NIH HHS/United StatesComparative StudyResearch Support, N.I.H., ExtramuralUnited StatesArchives of physical medicine and rehabilitationArch Phys Med Rehabil. 2008 Apr;89(4):622-9.}, month = {Apr}, pages = {622-629}, edition = {2008/04/01}, abstract = {OBJECTIVE: To examine score agreement, validity, precision, and response burden of a prototype computer adaptive testing (CAT) version of the self-care and social function scales of the Pediatric Evaluation of Disability Inventory compared with the full-length version of these scales. DESIGN: Computer simulation analysis of cross-sectional and longitudinal retrospective data; cross-sectional prospective study. SETTING: Pediatric rehabilitation hospital, including inpatient acute rehabilitation, day school program, outpatient clinics; community-based day care, preschool, and children{\textquoteright}s homes. PARTICIPANTS: Children with disabilities (n=469) and 412 children with no disabilities (analytic sample); 38 children with disabilities and 35 children without disabilities (cross-validation sample). INTERVENTIONS: Not applicable. MAIN OUTCOME MEASURES: Summary scores from prototype CAT applications of each scale using 15-, 10-, and 5-item stopping rules; scores from the full-length self-care and social function scales; time (in seconds) to complete assessments and respondent ratings of burden. RESULTS: Scores from both computer simulations and field administration of the prototype CATs were highly consistent with scores from full-length administration (r range, .94-.99). Using computer simulation of retrospective data, discriminant validity, and sensitivity to change of the CATs closely approximated that of the full-length scales, especially when the 15- and 10-item stopping rules were applied. In the cross-validation study the time to administer both CATs was 4 minutes, compared with over 16 minutes to complete the full-length scales. 
CONCLUSIONS: Self-care and social function score estimates from CAT administration are highly comparable with those obtained from full-length scale administration, with small losses in validity and precision and substantial decreases in administration time.}, keywords = {*Disability Evaluation, *Social Adjustment, Activities of Daily Living, Adolescent, Age Factors, Child, Child, Preschool, Computer Simulation, Cross-Over Studies, Disabled Children/*rehabilitation, Female, Follow-Up Studies, Humans, Infant, Male, Outcome Assessment (Health Care), Reference Values, Reproducibility of Results, Retrospective Studies, Risk Factors, Self Care/*standards/trends, Sex Factors, Sickness Impact Profile}, isbn = {1532-821X (Electronic)0003-9993 (Linking)}, author = {Coster, W. J. and Haley, S. M. and Ni, P. and Dumas, H. M. and Fragala-Pinkham, M. A.} } @article {2102, title = {Computer Adaptive-Attribute Testing A New Approach to Cognitive Diagnostic Assessment}, journal = {Zeitschrift f{\"u}r Psychologie / Journal of Psychology}, volume = {216}, year = {2008}, pages = {29-39}, abstract = {

The influence of interdisciplinary forces stemming from developments in cognitive science, mathematical statistics, educational psychology, and computing science is beginning to appear in educational and psychological assessment. Computer adaptive-attribute testing (CA-AT) is one example. The concepts and procedures in CA-AT can be found at the intersection between computer adaptive testing and cognitive diagnostic assessment. CA-AT allows us to fuse the administrative benefits of computer adaptive testing with the psychological benefits of cognitive diagnostic assessment to produce an innovative psychologically-based adaptive testing approach. We describe the concepts behind CA-AT as well as illustrate how it can be used to promote formative, computer-based, classroom assessment.

}, keywords = {cognition and assessment, cognitive diagnostic assessment, computer adaptive testing}, doi = {10.1027/0044-3409.216.1.29}, author = {Gierl, M. J. and Zhou, J.} } @article {169, title = {Computerized adaptive testing for follow-up after discharge from inpatient rehabilitation: II. Participation outcomes}, journal = {Archives of Physical Medicine and Rehabilitation}, volume = {89}, number = {2}, year = {2008}, note = {Haley, Stephen MGandek, BarbaraSiebens, HilaryBlack-Schaffer, Randie MSinclair, Samuel JTao, WeiCoster, Wendy JNi, PengshengJette, Alan MK02 HD045354-01A1/HD/NICHD NIH HHS/United StatesK02 HD45354-01/HD/NICHD NIH HHS/United StatesR01 HD043568/HD/NICHD NIH HHS/United StatesR01 HD043568-01/HD/NICHD NIH HHS/United StatesResearch Support, N.I.H., ExtramuralUnited StatesArchives of physical medicine and rehabilitationArch Phys Med Rehabil. 2008 Feb;89(2):275-83.}, month = {Feb}, pages = {275-283}, edition = {2008/01/30}, abstract = {OBJECTIVES: To measure participation outcomes with a computerized adaptive test (CAT) and compare CAT and traditional fixed-length surveys in terms of score agreement, respondent burden, discriminant validity, and responsiveness. DESIGN: Longitudinal, prospective cohort study of patients interviewed approximately 2 weeks after discharge from inpatient rehabilitation and 3 months later. SETTING: Follow-up interviews conducted in patient{\textquoteright}s home setting. PARTICIPANTS: Adults (N=94) with diagnoses of neurologic, orthopedic, or medically complex conditions. INTERVENTIONS: Not applicable. MAIN OUTCOME MEASURES: Participation domains of mobility, domestic life, and community, social, \& civic life, measured using a CAT version of the Participation Measure for Postacute Care (PM-PAC-CAT) and a 53-item fixed-length survey (PM-PAC-53). RESULTS: The PM-PAC-CAT showed substantial agreement with PM-PAC-53 scores (intraclass correlation coefficient, model 3,1, .71-.81). On average, the PM-PAC-CAT was completed in 42\% of the time and with only 48\% of the items as compared with the PM-PAC-53. Both formats discriminated across functional severity groups. The PM-PAC-CAT had modest reductions in sensitivity and responsiveness to patient-reported change over a 3-month interval as compared with the PM-PAC-53. CONCLUSIONS: Although continued evaluation is warranted, accurate estimates of participation status and responsiveness to change for group-level analyses can be obtained from CAT administrations, with a sizeable reduction in respondent burden.}, keywords = {*Activities of Daily Living, *Adaptation, Physiological, *Computer Systems, *Questionnaires, Adult, Aged, Aged, 80 and over, Chi-Square Distribution, Factor Analysis, Statistical, Female, Humans, Longitudinal Studies, Male, Middle Aged, Outcome Assessment (Health Care)/*methods, Patient Discharge, Prospective Studies, Rehabilitation/*standards, Subacute Care/*standards}, isbn = {1532-821X (Electronic)0003-9993 (Linking)}, author = {Haley, S. M. and Gandek, B. and Siebens, H. and Black-Schaffer, R. M. and Sinclair, S. J. and Tao, W. and Coster, W. J. and Ni, P. and Jette, A. M.} } @article {2103, title = {Computerized Adaptive Testing of Personality Traits}, journal = {Zeitschrift f{\"u}r Psychologie / Journal of Psychology}, volume = {216}, year = {2008}, pages = {12-21}, abstract = {

A computerized adaptive testing (CAT) procedure was simulated with ordinal polytomous personality data collected using a
conventional paper-and-pencil testing format. An adapted Dutch version of the dominance scale of Gough and Heilbrun{\textquoteright}s Adjective Check List (ACL) was used. This version contained Likert response scales with five categories. Item parameters were estimated using Samejima{\textquoteright}s graded response model from the responses of 1,925 subjects. The CAT procedure was simulated using the responses of 1,517 other subjects. The value of the required standard error in the stopping rule of the CAT was manipulated. The relationship between CAT latent trait estimates and estimates based on all dominance items was studied. Additionally, the pattern of relationships between the CAT latent trait estimates and the other ACL scales was compared to that between latent trait estimates based on the entire item pool and the other ACL scales. The CAT procedure resulted in latent trait estimates qualitatively equivalent to latent trait estimates based on all items, while a substantial reduction of the number of used items could be realized (at the stopping rule of 0.4 about 33\% of the 36 items was used).

}, keywords = {Adaptive Testing, computer-assisted testing, Item Response Theory, Likert scales, Personality Measures}, doi = {10.1027/0044-3409.216.1.12}, author = {Hol, A. M. and Vorst, H. C. M. and Mellenbergh, G. J.} } @article {307, title = {The D-optimality item selection criterion in the early stage of CAT: A study with the graded response model}, journal = {Journal of Educational and Behavioral Statistics}, volume = {33}, number = {1}, year = {2008}, pages = {88-110}, abstract = {During the early stage of computerized adaptive testing (CAT), item selection criteria based on Fisher{\textquoteright}s information often produce less stable latent trait estimates than the Kullback-Leibler global information criterion. Robustness against early stage instability has been reported for the D-optimality criterion in a polytomous CAT with the Nominal Response Model and is shown herein to be reproducible for the Graded Response Model. For comparative purposes, the A-optimality and the global information criteria are also applied. Their item selection is investigated as a function of test progression and item bank composition. The results indicate how the selection of specific item parameters underlies the criteria performances evaluated via accuracy and precision of estimation. In addition, the criteria item exposure rates are compared, without the use of any exposure controlling measure. On account of stability, precision, accuracy, numerical simplicity, and less evidently, item exposure rate, the D-optimality criterion can be recommended for CAT.}, keywords = {computerized adaptive testing, D optimality, item selection}, author = {Passos, V. L. and Berger, M. P. F. and Tan, F. E. S.} } @article {5, title = {Efficiency and sensitivity of multidimensional computerized adaptive testing of pediatric physical functioning}, journal = {Disability \& Rehabilitation}, volume = {30}, number = {6}, year = {2008}, note = {Allen, Diane DNi, PengshengHaley, Stephen MK02 HD45354-01/HD/NICHD NIH HHS/United StatesNIDDR H133P0001/DD/NCBDD CDC HHS/United StatesResearch Support, N.I.H., ExtramuralEnglandDisability and rehabilitationDisabil Rehabil. 2008;30(6):479-84.}, pages = {479-84}, edition = {2008/02/26}, abstract = {PURPOSE: Computerized adaptive tests (CATs) have efficiency advantages over fixed-length tests of physical functioning but may lose sensitivity when administering extremely low numbers of items. Multidimensional CATs may efficiently improve sensitivity by capitalizing on correlations between functional domains. Using a series of empirical simulations, we assessed the efficiency and sensitivity of multidimensional CATs compared to a longer fixed-length test. METHOD: Parent responses to the Pediatric Evaluation of Disability Inventory before and after intervention for 239 children at a pediatric rehabilitation hospital provided the data for this retrospective study. Reliability, effect size, and standardized response mean were compared between full-length self-care and mobility subscales and simulated multidimensional CATs with stopping rules at 40, 30, 20, and 10 items. RESULTS: Reliability was lowest in the 10-item CAT condition for the self-care (r = 0.85) and mobility (r = 0.79) subscales; all other conditions had high reliabilities (r > 0.94). All multidimensional CAT conditions had equivalent levels of sensitivity compared to the full set condition for both domains. 
CONCLUSIONS: Multidimensional CATs efficiently retain the sensitivity of longer fixed-length measures even with 5 items per dimension (10-item CAT condition). Measuring physical functioning with multidimensional CATs could enhance sensitivity following intervention while minimizing response burden.}, keywords = {*Disability Evaluation, Child, Computers, Disabled Children/*classification/rehabilitation, Efficiency, Humans, Outcome Assessment (Health Care), Psychometrics, Reproducibility of Results, Retrospective Studies, Self Care, Sensitivity and Specificity}, isbn = {0963-8288 (Print)0963-8288 (Linking)}, author = {Allen, D. D. and Ni, P. and Haley, S. M.} } @article {225, title = {ICAT: An adaptive testing procedure for the identification of idiosyncratic knowledge patterns}, journal = {Zeitschrift f{\"u}r Psychologie}, volume = {216}, number = {1}, year = {2008}, pages = {40-48}, abstract = {

Traditional adaptive tests provide an efficient method for estimating student achievement levels by adjusting the characteristics of the test questions to match the performance of each student. These traditional adaptive tests are not designed to identify idiosyncratic knowledge patterns. As students move through their education, they learn content in any number of different ways related to their learning style and cognitive development. This may result in a student having different achievement levels from one content area to another within a domain of content. This study investigates whether such idiosyncratic knowledge patterns exist. It discusses the differences between idiosyncratic knowledge patterns and multidimensionality. Finally, it proposes an adaptive testing procedure that can be used to identify a student{\textquoteright}s areas of strength and weakness more efficiently than current adaptive testing approaches. The findings of the study indicate that a fairly large number of students may have test results that are influenced by their idiosyncratic knowledge patterns. The findings suggest that these patterns persist across time for a large number of students, and that the differences in student performance between content areas within a subject domain are large enough to allow them to be useful in instruction. Given the existence of idiosyncratic patterns of knowledge, the proposed testing procedure may enable us to provide more useful information to teachers. It should also allow us to differentiate between idiosyncratic patterns of knowledge and important multidimensionality in the testing data.

}, keywords = {computerized adaptive testing}, author = {Kingsbury, G. G. and Houser, R.L.} } @article {84, title = {Letting the CAT out of the bag: Comparing computer adaptive tests and an 11-item short form of the Roland-Morris Disability Questionnaire}, journal = {Spine}, volume = {33}, number = {12}, year = {2008}, note = {Cook, Karon FChoi, Seung WCrane, Paul KDeyo, Richard AJohnson, Kurt LAmtmann, Dagmar5 P60-AR48093/AR/United States NIAMS5U01AR052171-03/AR/United States NIAMSComparative StudyResearch Support, N.I.H., ExtramuralUnited StatesSpineSpine. 2008 May 20;33(12):1378-83.}, month = {May 20}, pages = {1378-83}, edition = {2008/05/23}, abstract = {STUDY DESIGN: A post hoc simulation of a computer adaptive administration of the items of a modified version of the Roland-Morris Disability Questionnaire. OBJECTIVE: To evaluate the effectiveness of adaptive administration of back pain-related disability items compared with a fixed 11-item short form. SUMMARY OF BACKGROUND DATA: Short form versions of the Roland-Morris Disability Questionnaire have been developed. An alternative to paper-and-pencil short forms is to administer items adaptively so that items are presented based on a person{\textquoteright}s responses to previous items. Theoretically, this allows precise estimation of back pain disability with administration of only a few items. MATERIALS AND METHODS: Data were gathered from 2 previously conducted studies of persons with back pain. An item response theory model was used to calibrate scores based on all items, items of a paper-and-pencil short form, and several computer adaptive tests (CATs). RESULTS: Correlations between each CAT condition and scores based on a 23-item version of the Roland-Morris Disability Questionnaire ranged from 0.93 to 0.98. Compared with an 11-item short form, an 11-item CAT produced scores that were significantly more highly correlated with scores based on the 23-item scale. CATs with even fewer items also produced scores that were highly correlated with scores based on all items. For example, scores from a 5-item CAT had a correlation of 0.93 with full scale scores. Seven- and 9-item CATs correlated at 0.95 and 0.97, respectively. A CAT with a standard-error-based stopping rule produced scores that correlated at 0.95 with full scale scores. CONCLUSION: A CAT-based back pain-related disability measure may be a valuable tool for use in clinical and research contexts. Use of CAT for other common measures in back pain research, such as other functional scales or measures of psychological distress, may offer similar advantages.}, keywords = {*Disability Evaluation, *Health Status Indicators, Adult, Aged, Aged, 80 and over, Back Pain/*diagnosis/psychology, Calibration, Computer Simulation, Diagnosis, Computer-Assisted/*standards, Humans, Middle Aged, Models, Psychological, Predictive Value of Tests, Questionnaires/*standards, Reproducibility of Results}, isbn = {1528-1159 (Electronic)}, author = {Cook, K. F. and Choi, S. W. and Crane, P. K. and Deyo, R. A. and Johnson, K. L. and Amtmann, D.} } @article {287, title = {Measuring physical functioning in children with spinal impairments with computerized adaptive testing}, journal = {Journal of Pediatric Orthopedics}, volume = {28}, number = {3}, year = {2008}, note = {Mulcahey, M JHaley, Stephen MDuffy, TheresaPengsheng, NiBetz, Randal RK02 HD045354-01A1/HD/NICHD NIH HHS/United StatesUnited StatesJournal of pediatric orthopedicsJ Pediatr Orthop. 
2008 Apr-May;28(3):330-5.}, month = {Apr-May}, pages = {330-5}, edition = {2008/03/26}, abstract = {BACKGROUND: The purpose of this study was to assess the utility of measuring current physical functioning status of children with scoliosis and kyphosis by applying computerized adaptive testing (CAT) methods. Computerized adaptive testing uses a computer interface to administer the most optimal items based on previous responses, reducing the number of items needed to obtain a scoring estimate. METHODS: This was a prospective study of 77 subjects (0.6-19.8 years) who were seen by a spine surgeon during a routine clinic visit for progress spine deformity. Using a multidimensional version of the Pediatric Evaluation of Disability Inventory CAT program (PEDI-MCAT), we evaluated content range, accuracy and efficiency, known-group validity, concurrent validity with the Pediatric Outcomes Data Collection Instrument, and test-retest reliability in a subsample (n = 16) within a 2-week interval. RESULTS: We found the PEDI-MCAT to have sufficient item coverage in both self-care and mobility content for this sample, although most patients tended to score at the higher ends of both scales. Both the accuracy of PEDI-MCAT scores as compared with a fixed format of the PEDI (r = 0.98 for both mobility and self-care) and test-retest reliability were very high [self-care: intraclass correlation (3,1) = 0.98, mobility: intraclass correlation (3,1) = 0.99]. The PEDI-MCAT took an average of 2.9 minutes for the parents to complete. The PEDI-MCAT detected expected differences between patient groups, and scores on the PEDI-MCAT correlated in expected directions with scores from the Pediatric Outcomes Data Collection Instrument domains. CONCLUSIONS: Use of the PEDI-MCAT to assess the physical functioning status, as perceived by parents of children with complex spinal impairments, seems to be feasible and achieves accurate and efficient estimates of self-care and mobility function. Additional item development will be needed at the higher functioning end of the scale to avoid ceiling effects for older children. LEVEL OF EVIDENCE: This is a level II prospective study designed to establish the utility of computer adaptive testing as an evaluation method in a busy pediatric spine practice.}, keywords = {*Disability Evaluation, Adolescent, Child, Child, Preschool, Computer Simulation, Cross-Sectional Studies, Disabled Children/*rehabilitation, Female, Humans, Infant, Kyphosis/*diagnosis/rehabilitation, Male, Prospective Studies, Reproducibility of Results, Scoliosis/*diagnosis/rehabilitation}, isbn = {0271-6798 (Print)0271-6798 (Linking)}, author = {Mulcahey, M. J. and Haley, S. M. and Duffy, T. and Pengsheng, N. and Betz, R. R.} } @article {17, title = {Rotating item banks versus restriction of maximum exposure rates in computerized adaptive testing}, journal = {Spanish Journal of Psychology}, volume = {11}, number = {2}, year = {2008}, note = {Barrada, Juan RamonOlea, JulioAbad, Francisco JoseResearch Support, Non-U.S. Gov{\textquoteright}tSpainThe Spanish journal of psychologySpan J Psychol. 2008 Nov;11(2):618-25.}, pages = {618-625}, edition = {2008/11/08}, abstract = {

If examinees were to know, beforehand, part of the content of a computerized adaptive test, their estimated trait levels would then have a marked positive bias. One of the strategies to avoid this consists of dividing a large item bank into several sub-banks and rotating the sub-bank employed (Ariel, Veldkamp \& van der Linden, 2004). This strategy permits substantial improvements in exposure control at little cost to measurement accuracy. However, we do not know whether this option provides better results than using the master bank with greater restriction in the maximum exposure rates (Sympson \& Hetter, 1985). In order to investigate this issue, we worked with several simulated banks of 2100 items, comparing them, for RMSE and overlap rate, with the same banks divided into two, three... up to seven sub-banks. By means of extensive manipulation of the maximum exposure rate in each bank, we found that the option of rotating banks slightly outperformed the option of restricting the maximum exposure rate of the master bank by means of the Sympson-Hetter method.

}, keywords = {*Character, *Databases, *Software Design, Aptitude Tests/*statistics \& numerical data, Bias (Epidemiology), Computing Methodologies, Diagnosis, Computer-Assisted/*statistics \& numerical data, Educational Measurement/*statistics \& numerical data, Humans, Mathematical Computing, Psychometrics/statistics \& numerical data}, isbn = {1138-7416}, author = {Barrada, J and Olea, J. and Abad, F. J.} } @article {400, title = {Some new developments in adaptive testing technology}, journal = {Zeitschrift f{\"u}r Psychologie}, volume = {216}, number = {1}, year = {2008}, pages = {3-11}, abstract = {

In an ironic twist of history, modern psychological testing has returned to an adaptive format quite common when testing was not yet standardized. Important stimuli to the renewed interest in adaptive testing have been the development of item-response theory in psychometrics, which models the responses on test items using separate parameters for the items and test takers, and the use of computers in test administration, which enables us to estimate the parameter for a test taker and select the items in real time. This article reviews a selection from the latest developments in the technology of adaptive testing, such as constrained adaptive item selection, adaptive testing using rule-based item generation, multidimensional adaptive testing, adaptive use of test batteries, and the use of response times in adaptive testing.

}, keywords = {computerized adaptive testing}, author = {van der Linden, W. J.} } @article {210, title = {Computerized adaptive testing for measuring development of young children}, journal = {Statistics in Medicine}, volume = {26}, number = {13}, year = {2007}, note = {Jacobusse, GertBuuren, Stef vanEnglandStatistics in medicineStat Med. 2007 Jun 15;26(13):2629-38.}, month = {Jun 15}, pages = {2629-38}, edition = {2006/11/30}, abstract = {Developmental indicators that are used for routine measurement in The Netherlands are usually chosen to optimally identify delayed children. Measurements on the majority of children without problems are therefore quite imprecise. This study explores the use of computerized adaptive testing (CAT) to monitor the development of young children. CAT is expected to improve the measurement precision of the instrument. We do two simulation studies - one with real data and one with simulated data - to evaluate the usefulness of CAT. It is shown that CAT selects developmental indicators that maximally match the individual child, so that all children can be measured to the same precision.}, keywords = {*Child Development, *Models, Statistical, Child, Preschool, Diagnosis, Computer-Assisted/*statistics \& numerical data, Humans, Netherlands}, isbn = {0277-6715 (Print)}, author = {Jacobusse, G. and Buuren, S.} } @article {199, title = {Computerized adaptive testing for polytomous motivation items: Administration mode effects and a comparison with short forms}, journal = {Applied Psychological Measurement}, volume = {31}, number = {5}, year = {2007}, note = {10.1177/0146621606297314Journal; Peer Reviewed Journal; Journal Article}, pages = {412-429}, abstract = {In a randomized experiment (n=515), a computerized and a computerized adaptive test (CAT) are compared. The item pool consists of 24 polytomous motivation items. Although items are carefully selected, calibration data show that Samejima{\textquoteright}s graded response model did not fit the data optimally. A simulation study is done to assess possible consequences of model misfit. CAT efficiency was studied by a systematic comparison of the CAT with two types of conventional fixed length short forms, which are created to be good CAT competitors. Results showed no essential administration mode effects. Efficiency analyses show that CAT outperformed the short forms in almost all aspects when results are aggregated along the latent trait scale. The real and the simulated data results are very similar, which indicate that the real data results are not affected by model misfit. (PsycINFO Database Record (c) 2007 APA ) (journal abstract)}, keywords = {2220 Tests \& Testing, Adaptive Testing, Attitude Measurement, computer adaptive testing, Computer Assisted Testing, items, Motivation, polytomous motivation, Statistical Validity, Test Administration, Test Forms, Test Items}, isbn = {0146-6216}, author = {Hol, A. M. and Vorst, H. C. M. and Mellenbergh, G. J.} } @proceedings {389, title = {Computerized classification testing with composite hypotheses}, journal = {GMAC Conference on Computerized Adaptive Testing}, year = {2007}, note = {Proceedings of the 2007 GMAC Conference on Computerized Adaptive Testing. Retrieved [date] from www. psych. umn. edu/psylabs/CATCentral}, publisher = {Graduate Management Admissions Council}, address = {St. Paul, MN}, keywords = {computerized adaptive testing}, author = {Thompson, N. A. 
and Ro, S.} } @article {111, title = {Evaluation of computer adaptive testing systems}, journal = {International Journal of Web-Based Learning and Teaching Technologies}, volume = {2}, number = {1}, year = {2007}, pages = {70-87}, publisher = {IGI Global: US}, abstract = {Many educational organizations are trying to reduce the cost of the exams, the workload and delay of scoring, and the human errors. Also, they try to increase the accuracy and efficiency of the testing. Recently, most examination organizations use computer adaptive testing (CAT) as the method for large scale testing. This article investigates the current state of CAT systems and identifies their strengths and weaknesses. It evaluates 10 CAT systems using an evaluation framework of 15 domains categorized into three dimensions: educational, technical, and economical. The results show that the majority of the CAT systems give priority to security, reliability, and maintainability. However, they do not offer to the examinee any advanced support and functionalities. Also, the feedback to the examinee is limited and the presentation of the items is poor. Recommendations are made in order to enhance the overall quality of a CAT system. For example, alternative multimedia items should be available so that the examinee would choose a preferred media type. Feedback could be improved by providing more information to the examinee or providing information anytime the examinee wished. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computer adaptive testing systems, examination organizations, systems evaluation}, isbn = {1548-1093 (Print); 1548-1107 (Electronic)}, author = {Economides, A. A. and Roupas, C} } @article {52, title = {Improving patient reported outcomes using item response theory and computerized adaptive testing}, journal = {Journal of Rheumatology}, volume = {34}, number = {6}, year = {2007}, note = {Chakravarty, Eliza FBjorner, Jakob BFries, James FAr052158/ar/niamsConsensus Development ConferenceResearch Support, N.I.H., ExtramuralCanadaThe Journal of rheumatologyJ Rheumatol. 2007 Jun;34(6):1426-31.}, month = {Jun}, pages = {1426-31}, edition = {2007/06/07}, abstract = {OBJECTIVE: Patient reported outcomes (PRO) are considered central outcome measures for both clinical trials and observational studies in rheumatology. More sophisticated statistical models, including item response theory (IRT) and computerized adaptive testing (CAT), will enable critical evaluation and reconstruction of currently utilized PRO instruments to improve measurement precision while reducing item burden on the individual patient. METHODS: We developed a domain hierarchy encompassing the latent trait of physical function/disability from the more general to most specific. Items collected from 165 English-language instruments were evaluated by a structured process including trained raters, modified Delphi expert consensus, and then patient evaluation. Each item in the refined data bank will undergo extensive analysis using IRT to evaluate response functions and measurement precision. CAT will allow for real-time questionnaires of potentially smaller numbers of questions tailored directly to each individual{\textquoteright}s level of physical function. RESULTS: Physical function/disability domain comprises 4 subdomains: upper extremity, trunk, lower extremity, and complex activities. 
Expert and patient review led to consensus favoring use of present-tense "capability" questions using a 4- or 5-item Likert response construct over past-tense "performance" items. Floor and ceiling effects, attribution of disability, and standardization of response categories were also addressed. CONCLUSION: By applying statistical techniques of IRT through use of CAT, existing PRO instruments may be improved to reduce questionnaire burden on the individual patients while increasing measurement precision that may ultimately lead to reduced sample size requirements for costly clinical trials.}, keywords = {*Rheumatic Diseases/physiopathology/psychology, Clinical Trials, Data Interpretation, Statistical, Disability Evaluation, Health Surveys, Humans, International Cooperation, Outcome Assessment (Health Care)/*methods, Patient Participation/*methods, Research Design/*trends, Software}, isbn = {0315-162X (Print)}, author = {Chakravarty, E. F. and Bjorner, J. B. and Fries, J.F.} } @article {363, title = {The initial development of an item bank to assess and screen for psychological distress in cancer patients}, journal = {Psycho-Oncology}, volume = {16}, number = {8}, year = {2007}, note = {10.1002/pon.1117Journal; Peer Reviewed Journal; Journal Article}, pages = {724-732}, abstract = {Psychological distress is a common problem among cancer patients. Despite the large number of instruments that have been developed to assess distress, their utility remains disappointing. This study aimed to use Rasch models to develop an item-bank which would provide the basis for better means of assessing psychological distress in cancer patients. An item bank was developed from eight psychological distress questionnaires using Rasch analysis to link common items. Items from the questionnaires were added iteratively with common items as anchor points and misfitting items (infit mean square > 1.3) removed, and unidimensionality assessed. A total of 4914 patients completed the questionnaires providing an initial pool of 83 items. Twenty items were removed resulting in a final pool of 63 items. Good fit was demonstrated and no additional factor structure was evident from the residuals. However, there was little overlap between item locations and person measures, since items mainly targeted higher levels of distress. The Rasch analysis allowed items to be pooled and generated a unidimensional instrument for measuring psychological distress in cancer patients. Additional items are required to more accurately assess patients across the whole continuum of psychological distress. (PsycINFO Database Record (c) 2007 APA ) (journal abstract)}, keywords = {3293 Cancer, cancer patients, Distress, initial development, Item Response Theory, Models, Neoplasms, Patients, Psychological, psychological distress, Rasch, Stress}, isbn = {1057-9249}, author = {Smith, A. B. and Rush, R. and Velikova, G. and Wall, L. and Wright, E. P. and Stark, D. and Selby, P. and Sharpe, M.} } @article {18, title = {Methods for restricting maximum exposure rate in computerized adaptative testing}, journal = {Methodology: European Journal of Research Methods for the Behavioral and Social Sciences}, volume = {3}, number = {1}, year = {2007}, pages = {14-23}, publisher = {Hogrefe \& Huber Publishers GmbH: Germany}, abstract = {The Sympson-Hetter (1985) method provides a means of controlling maximum exposure rate of items in Computerized Adaptive Testing. 
Through a series of simulations, control parameters are set that mark the probability of administration of an item on being selected. This method presents two main problems: it requires a long computation time for calculating the parameters and the maximum exposure rate is slightly above the fixed limit. Van der Linden (2003) presented two alternatives which appear to solve both of the problems. The impact of these methods on measurement accuracy has not yet been tested. We show how these methods over-restrict the exposure of some highly discriminating items and, thus, the accuracy is decreased. It is also shown that, when the desired maximum exposure rate is near the minimum possible value, these methods offer an empirical maximum exposure rate clearly above the goal. A new method, based on the initial estimation of the probability of administration and the probability of selection of the items with the restricted method (Revuelta \& Ponsoda, 1998), is presented in this paper. It can be used with the Sympson-Hetter method and with both of van der Linden{\textquoteright}s methods. This option, when used with Sympson-Hetter, speeds the convergence of the control parameters without decreasing the accuracy. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computerized adaptive testing, item bank security, item exposure control, overlap rate, Sympson-Hetter method}, isbn = {1614-1881 (Print); 1614-2241 (Electronic)}, author = {Barrada, J and Olea, J. and Ponsoda, V.} } @article {53, title = {Patient-reported outcomes measurement and management with innovative methodologies and technologies}, journal = {Quality of Life Research}, volume = {16 Suppl 1}, year = {2007}, note = {Chang, Chih-HungR21CA113191/CA/NCI NIH HHS/United StatesResearch Support, N.I.H., ExtramuralNetherlandsQuality of life research : an international journal of quality of life aspects of treatment, care and rehabilitationQual Life Res. 2007;16 Suppl 1:157-66. Epub 2007 May 26.}, pages = {157-66}, edition = {2007/05/29}, abstract = {Successful integration of modern psychometrics and advanced informatics in patient-reported outcomes (PRO) measurement and management can potentially maximize the value of health outcomes research and optimize the delivery of quality patient care. Unlike the traditional labor-intensive paper-and-pencil data collection method, item response theory-based computerized adaptive testing methodologies coupled with novel technologies provide an integrated environment to collect, analyze and present ready-to-use PRO data for informed and shared decision-making. 
This article describes the needs, challenges and solutions for accurate, efficient and cost-effective PRO data acquisition and dissemination means in order to provide critical and timely PRO information necessary to actively support and enhance routine patient care in busy clinical settings.}, keywords = {*Health Status, *Outcome Assessment (Health Care), *Quality of Life, *Software, Computer Systems/*trends, Health Insurance Portability and Accountability Act, Humans, Patient Satisfaction, Questionnaires, United States}, isbn = {0962-9343 (Print)0962-9343 (Linking)}, author = {Chang, C-H.} } @article {387, title = {A practitioner{\textquoteright}s guide to variable-length computerized classification testing}, journal = {Practical Assessment, Research and Evaluation}, volume = {12 }, number = {1}, year = {2007}, month = {7/1/2009}, chapter = {January, 2007}, abstract = {Variable-length computerized classification tests, CCTs, (Lin \& Spray, 2000; Thompson, 2006) are a powerful and efficient approach to testing for the purpose of classifying examinees into groups. CCTs are designed by the specification of at least five technical components: psychometric model, calibrated item bank, starting point, item selection algorithm, and termination criterion. Several options exist for each of these CCT components, creating a myriad of possible designs. Confusion among designs is exacerbated by the lack of a standardized nomenclature. This article outlines the components of a CCT, common options for each component, and the interaction of options for different components, so that practitioners may more efficiently design CCTs. It also offers a suggestion of nomenclature. }, keywords = {CAT, classification, computer adaptive testing, computerized adaptive testing, Computerized classification testing}, author = {Thompson, N. A.} } @article {328, title = {Psychometric evaluation and calibration of health-related quality of life item banks: plans for the Patient-Reported Outcomes Measurement Information System (PROMIS)}, journal = {Medical Care}, volume = {45}, number = {5 Suppl 1}, year = {2007}, note = {Reeve, Bryce BHays, Ron DBjorner, Jakob BCook, Karon FCrane, Paul KTeresi, Jeanne AThissen, DavidRevicki, Dennis AWeiss, David JHambleton, Ronald KLiu, HonghuGershon, RichardReise, Steven PLai, Jin-sheiCella, DavidPROMIS Cooperative GroupAG015815/AG/United States NIAResearch Support, N.I.H., ExtramuralUnited StatesMedical careMed Care. 2007 May;45(5 Suppl 1):S22-31.}, month = {May}, pages = {S22-31}, edition = {2007/04/20}, abstract = {BACKGROUND: The construction and evaluation of item banks to measure unidimensional constructs of health-related quality of life (HRQOL) is a fundamental objective of the Patient-Reported Outcomes Measurement Information System (PROMIS) project. OBJECTIVES: Item banks will be used as the foundation for developing short-form instruments and enabling computerized adaptive testing. The PROMIS Steering Committee selected 5 HRQOL domains for initial focus: physical functioning, fatigue, pain, emotional distress, and social role participation. This report provides an overview of the methods used in the PROMIS item analyses and proposed calibration of item banks. 
ANALYSES: Analyses include evaluation of data quality (eg, logic and range checking, spread of response distribution within an item), descriptive statistics (eg, frequencies, means), item response theory model assumptions (unidimensionality, local independence, monotonicity), model fit, differential item functioning, and item calibration for banking. RECOMMENDATIONS: Summarized are key analytic issues; recommendations are provided for future evaluations of item banks in HRQOL assessment.}, keywords = {*Health Status, *Information Systems, *Quality of Life, *Self Disclosure, Adolescent, Adult, Aged, Calibration, Databases as Topic, Evaluation Studies as Topic, Female, Humans, Male, Middle Aged, Outcome Assessment (Health Care)/*methods, Psychometrics, Questionnaires/standards, United States}, isbn = {0025-7079 (Print)}, author = {Reeve, B. B. and Hays, R. D. and Bjorner, J. B. and Cook, K. F. and Crane, P. K. and Teresi, J. A. and Thissen, D. and Revicki, D. A. and Weiss, D. J. and Hambleton, R. K. and Liu, H. and Gershon, R. C. and Reise, S. P. and Lai, J. S. and Cella, D.} } @article {343, title = {Psychometric properties of an emotional adjustment measure: An application of the graded response model}, journal = {European Journal of Psychological Assessment}, volume = {23}, number = {1}, year = {2007}, pages = {39-46}, publisher = {Hogrefe \& Huber Publishers GmbH: Germany}, abstract = {Item response theory (IRT) provides valuable methods for the analysis of the psychometric properties of a psychological measure. However, IRT has been mainly used for assessing achievements and ability rather than personality factors. This paper presents an application of the IRT to a personality measure. Thus, the psychometric properties of a new emotional adjustment measure that consists of 28 six-graded response items are shown. Classical test theory (CTT) analyses as well as IRT analyses are carried out. Samejima{\textquoteright}s (1969) graded-response model has been used for estimating item parameters. Results show that the bank of items fulfills model assumptions and fits the data reasonably well, demonstrating the suitability of the IRT models for the description and use of data originating from personality measures. In this sense, the model fulfills the expectations that IRT has undoubted advantages: (1) The invariance of the estimated parameters, (2) the treatment given to the standard error of measurement, and (3) the possibilities offered for the construction of computerized adaptive tests (CAT). The bank of items shows good reliability. It also shows convergent validity compared to the Eysenck Personality Inventory (EPQ-A; Eysenck \& Eysenck, 1975) and the Big Five Questionnaire (BFQ; Caprara, Barbaranelli, \& Borgogni, 1993). (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computerized adaptive tests, Emotional Adjustment, Item Response Theory, Personality Measures, personnel recruitment, Psychometrics, Samejima{\textquoteright}s graded response model, test reliability, validity}, isbn = {1015-5759 (Print)}, author = {Rubio, V. J. and Aguado, D. and Hontangas, P. M. and Hern{\'a}ndez, J. 
M.} } @article {306, title = {Test design optimization in CAT early stage with the nominal response model}, journal = {Applied Psychological Measurement}, volume = {31}, number = {3}, year = {2007}, pages = {213-232}, publisher = {Sage Publications: US}, abstract = {The early stage of computerized adaptive testing (CAT) refers to the phase of the trait estimation during the administration of only a few items. This phase can be characterized by bias and instability of estimation. In this study, an item selection criterion is introduced in an attempt to lessen this instability: the D-optimality criterion. A polytomous unconstrained CAT simulation is carried out to evaluate this criterion{\textquoteright}s performance under different test premises. The simulation shows that the extent of early stage instability depends primarily on the quality of the item pool information and its size and secondarily on the item selection criteria. The efficiency of the D-optimality criterion is similar to the efficiency of other known item selection criteria. Yet, it often yields estimates that, at the beginning of CAT, display a more robust performance against instability. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computerized adaptive testing, nominal response model, robust performance, test design optimization}, isbn = {0146-6216 (Print)}, author = {Passos, V. L. and Berger, M. P. F. and Tan, F. E.} } @article {189, title = {Adaptive success control in computerized adaptive testing}, journal = {Psychology Science}, volume = {48}, number = {4}, year = {2006}, pages = {436-450}, publisher = {Pabst Science Publishers: Germany}, abstract = {In computerized adaptive testing (CAT) procedures within the framework of probabilistic test theory the difficulty of an item is adjusted to the ability of the respondent, with the aim of maximizing the amount of information generated per item, thereby also increasing test economy and test reasonableness. However, earlier research indicates that respondents might feel over-challenged by a constant success probability of p = 0.5 and therefore cannot come to a sufficiently high answer certainty within a reasonable timeframe. Consequently response time per item increases, which -- depending on the test material -- can outweigh the benefit of administering optimally informative items. Instead of a benefit, the result of using CAT procedures could be a loss of test economy. Based on this problem, an adaptive success control algorithm was designed and tested, adapting the success probability to the working style of the respondent. Persons who need higher answer certainty in order to come to a decision are detected and receive a higher success probability, in order to minimize the test duration (not the number of items as in classical CAT). The method is validated on the re-analysis of data from the Adaptive Matrices Test (AMT, Hornke, Etzel \& Rettig, 1999) and by the comparison between an AMT version using classical CAT and an experimental version using Adaptive Success Control. The results are discussed in the light of psychometric and psychological aspects of test quality. 
(PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {adaptive success control, computerized adaptive testing, Psychometrics}, isbn = {0033-3018 (Print)}, author = {H{\"a}usler, Joachim} } @article {310, title = {Applying Bayesian item selection approaches to adaptive tests using polytomous items}, journal = {Applied Measurement in Education}, volume = {19}, number = {1}, year = {2006}, pages = {1-20}, publisher = {Lawrence Erlbaum: US}, abstract = {This study applied the maximum expected information (MEI) and the maximum posterior-weighted information (MPI) approaches of computer adaptive testing item selection to the case of a test using polytomous items following the partial credit model. The MEI and MPI approaches are described. A simulation study compared the efficiency of ability estimation using the MEI and MPI approaches to the traditional maximal item information (MII) approach. The results of the simulation study indicated that the MEI and MPI approaches led to a superior efficiency of ability estimation compared with the MII approach. The superiority of the MEI and MPI approaches over the MII approach was greatest when the bank contained items having a relatively peaked information function. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {adaptive tests, Bayesian item selection, computer adaptive testing, maximum expected information, polytomous items, posterior weighted information}, isbn = {0895-7347 (Print); 1532-4818 (Electronic)}, author = {Penfield, R. D.} } @article {401, title = {Assembling a computerized adaptive testing item pool as a set of linear tests}, journal = {Journal of Educational and Behavioral Statistics}, volume = {31}, number = {1}, year = {2006}, pages = {81-99}, publisher = {Sage Publications: US}, abstract = {Test-item writing efforts typically result in item pools with an undesirable correlational structure between the content attributes of the items and their statistical information. If such pools are used in computerized adaptive testing (CAT), the algorithm may be forced to select items with less than optimal information, that violate the content constraints, and/or have unfavorable exposure rates. Although at first sight somewhat counterintuitive, it is shown that if the CAT pool is assembled as a set of linear test forms, undesirable correlations can be broken down effectively. It is proposed to assemble such pools using a mixed integer programming model with constraints that guarantee that each test meets all content specifications and an objective function that requires them to have maximal information at a well-chosen set of ability values. An empirical example with a previous master pool from the Law School Admission Test (LSAT) yielded a CAT with nearly uniform bias and mean-squared error functions for the ability estimator and item-exposure rates that satisfied the target for all items in the pool. }, keywords = {Algorithms, computerized adaptive testing, item pool, linear tests, mathematical models, statistics, Test Construction, Test Items}, isbn = {1076-9986 (Print)}, author = {van der Linden, W. J. and Ariel, A. and Veldkamp, B. 
P.} } @article {247, title = {Comparing methods of assessing differential item functioning in a computerized adaptive testing environment}, journal = {Journal of Educational Measurement}, volume = {43}, number = {3}, year = {2006}, pages = {245-264}, publisher = {Blackwell Publishing: United Kingdom}, abstract = {Mantel-Haenszel and SIBTEST, which have known difficulty in detecting non-unidirectional differential item functioning (DIF), have been adapted with some success for computerized adaptive testing (CAT). This study adapts logistic regression (LR) and the item-response-theory-likelihood-ratio test (IRT-LRT), capable of detecting both unidirectional and non-unidirectional DIF, to the CAT environment in which pretest items are assumed to be seeded in CATs but not used for trait estimation. The proposed adaptation methods were evaluated with simulated data under different sample size ratios and impact conditions in terms of Type I error, power, and specificity in identifying the form of DIF. The adapted LR and IRT-LRT procedures are more powerful than the CAT version of SIBTEST for non-unidirectional DIF detection. The good Type I error control provided by IRT-LRT under extremely unequal sample sizes and large impact is encouraging. Implications of these and other findings are discussed. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computerized adaptive testing, educational testing, item response theory likelihood ratio test, logistic regression, trait estimation, unidirectional \& non-unidirectional differential item functioning}, isbn = {0022-0655 (Print)}, author = {Lei, P-W. and Chen, S-Y. and Yu, L.} } @article {164, title = {The comparison among item selection strategies of CAT with multiple-choice items}, journal = {Acta Psychologica Sinica}, volume = {38}, number = {5}, year = {2006}, pages = {778-783}, publisher = {Science Press: China}, abstract = {The initial purpose of comparing item selection strategies for CAT was to increase the efficiency of tests. As studies continued, however, it was found that increasing the efficiency of item bank use was also an important goal of comparing item selection strategies. These two goals often conflicted. The key solution was to find a strategy with which both goals could be accomplished. The item selection strategies for the graded response model in this study included: the average of the difficulty orders matching with the ability; the medium of the difficulty orders matching with the ability; maximum information; A stratified (average); and A stratified (medium). The evaluation indexes used for comparison included: the bias of ability estimates for the true; the standard error of ability estimates; the average number of items the examinees were administered; the standard deviation of the frequency of items selected; and the sum of the indices weighted. Using the Monte Carlo simulation method, we obtained data and iterated them 20 times by computer under the conditions that the item difficulty parameters followed the normal distribution and the even distribution. The results indicated that, no matter whether the difficulty parameters followed the normal distribution or the even distribution, every type of item selection strategy designed in this research had its strong and weak points. In general evaluation, under the condition that items were stratified appropriately, A stratified (medium) (ASM) had the best effect. 
(PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {CAT, computerized adaptive testing, graded response model, item selection strategies, multiple choice items}, isbn = {0439-755X (Print)}, author = {Hai-qi, D. and De-zhi, C. and Shuliang, D. and Taiping, D.} } @article {172, title = {Computer adaptive testing improved accuracy and precision of scores over random item selection in a physical functioning item bank}, journal = {Journal of Clinical Epidemiology}, volume = {59}, number = {11}, year = {2006}, note = {Haley, Stephen MNi, PengshengHambleton, Ronald KSlavin, Mary DJette, Alan MK02 hd45354-01/hd/nichdR01 hd043568/hd/nichdComparative StudyResearch Support, N.I.H., ExtramuralResearch Support, U.S. Gov{\textquoteright}t, Non-P.H.S.EnglandJournal of clinical epidemiologyJ Clin Epidemiol. 2006 Nov;59(11):1174-82. Epub 2006 Jul 11.}, month = {Nov}, pages = {1174-82}, edition = {2006/10/10}, abstract = {BACKGROUND AND OBJECTIVE: Measuring physical functioning (PF) within and across postacute settings is critical for monitoring outcomes of rehabilitation; however, most current instruments lack sufficient breadth and feasibility for widespread use. Computer adaptive testing (CAT), in which item selection is tailored to the individual patient, holds promise for reducing response burden, yet maintaining measurement precision. We calibrated a PF item bank via item response theory (IRT), administered items with a post hoc CAT design, and determined whether CAT would improve accuracy and precision of score estimates over random item selection. METHODS: 1,041 adults were interviewed during postacute care rehabilitation episodes in either hospital or community settings. Responses for 124 PF items were calibrated using IRT methods to create a PF item bank. We examined the accuracy and precision of CAT-based scores compared to a random selection of items. RESULTS: CAT-based scores had higher correlations with the IRT-criterion scores, especially with short tests, and resulted in narrower confidence intervals than scores based on a random selection of items; gains, as expected, were especially large for low and high performing adults. CONCLUSION: The CAT design may have important precision and efficiency advantages for point-of-care functional assessment in rehabilitation practice settings.}, keywords = {*Recovery of Function, Activities of Daily Living, Adolescent, Adult, Aged, Aged, 80 and over, Confidence Intervals, Factor Analysis, Statistical, Female, Humans, Male, Middle Aged, Outcome Assessment (Health Care)/*methods, Rehabilitation/*standards, Reproducibility of Results, Software}, isbn = {0895-4356 (Print)}, author = {Haley, S. M. and Ni, P. and Hambleton, R. K. and Slavin, M. D. and Jette, A. M.} } @inbook {109, title = {Computer-based testing}, booktitle = {Handbook of multimethod measurement in psychology}, volume = {xiv}, year = {2006}, note = {Using Smart Source ParsingHandbook of multimethod measurement in psychology. (pp. 87-100). Washington, DC : American Psychological Association, [URL:http://www.apa.org/books]. xiv, 553 pp}, pages = {87-100}, publisher = {American Psychological Association}, organization = {American Psychological Association}, address = {Washington D.C. USA}, abstract = {(From the chapter) There has been a proliferation of research designed to explore and exploit opportunities provided by computer-based assessment. This chapter provides an overview of the diverse efforts by researchers in this area. 
It begins by describing how paper-and-pencil tests can be adapted for administration by computers. Computerization provides the important advantage that items can be selected so they are of appropriate difficulty for each examinee. Some of the psychometric theory needed for computerized adaptive testing is reviewed. Then research on innovative computerized assessments is summarized. These assessments go beyond multiple-choice items by using formats made possible by computerization. Then some hardware and software issues are described, and finally, directions for future work are outlined. (PsycINFO Database Record (c) 2006 APA )}, keywords = {Adaptive Testing computerized adaptive testing, Computer Assisted Testing, Experimentation, Psychometrics, Theories}, author = {F Drasgow and Chuah, S. C.} } @article {176, title = {Computerized adaptive testing for follow-up after discharge from inpatient rehabilitation: I. Activity outcomes}, journal = {Archives of Physical Medicine and Rehabilitation}, volume = {87}, number = {8}, year = {2006}, note = {Haley, Stephen MSiebens, HilaryCoster, Wendy JTao, WeiBlack-Schaffer, Randie MGandek, BarbaraSinclair, Samuel JNi, PengshengK0245354-01/phsR01 hd043568/hd/nichdResearch Support, N.I.H., ExtramuralUnited StatesArchives of physical medicine and rehabilitationArch Phys Med Rehabil. 2006 Aug;87(8):1033-42.}, month = {Aug}, pages = {1033-42}, edition = {2006/08/01}, abstract = {OBJECTIVE: To examine score agreement, precision, validity, efficiency, and responsiveness of a computerized adaptive testing (CAT) version of the Activity Measure for Post-Acute Care (AM-PAC-CAT) in a prospective, 3-month follow-up sample of inpatient rehabilitation patients recently discharged home. DESIGN: Longitudinal, prospective 1-group cohort study of patients followed approximately 2 weeks after hospital discharge and then 3 months after the initial home visit. SETTING: Follow-up visits conducted in patients{\textquoteright} home setting. PARTICIPANTS: Ninety-four adults who were recently discharged from inpatient rehabilitation, with diagnoses of neurologic, orthopedic, and medically complex conditions. INTERVENTIONS: Not applicable. MAIN OUTCOME MEASURES: Summary scores from AM-PAC-CAT, including 3 activity domains of movement and physical, personal care and instrumental, and applied cognition were compared with scores from a traditional fixed-length version of the AM-PAC with 66 items (AM-PAC-66). RESULTS: AM-PAC-CAT scores were in good agreement (intraclass correlation coefficient model 3,1 range, .77-.86) with scores from the AM-PAC-66. On average, the CAT programs required 43\% of the time and 33\% of the items compared with the AM-PAC-66. Both formats discriminated across functional severity groups. The standardized response mean (SRM) was greater for the movement and physical fixed form than the CAT; the effect size and SRM of the 2 other AM-PAC domains showed similar sensitivity between CAT and fixed formats. Using patients{\textquoteright} own report as an anchor-based measure of change, the CAT and fixed length formats were comparable in responsiveness to patient-reported change over a 3-month interval. 
CONCLUSIONS: Accurate estimates for functional activity group-level changes can be obtained from CAT administrations, with a considerable reduction in administration time.}, keywords = {*Activities of Daily Living, *Adaptation, Physiological, *Computer Systems, *Questionnaires, Adult, Aged, Aged, 80 and over, Chi-Square Distribution, Factor Analysis, Statistical, Female, Humans, Longitudinal Studies, Male, Middle Aged, Outcome Assessment (Health Care)/*methods, Patient Discharge, Prospective Studies, Rehabilitation/*standards, Subacute Care/*standards}, isbn = {0003-9993 (Print)}, author = {Haley, S. M. and Siebens, H. and Coster, W. J. and Tao, W. and Black-Schaffer, R. M. and Gandek, B. and Sinclair, S. J. and Ni, P.} } @article {399, title = {Equating scores from adaptive to linear tests}, journal = {Applied Psychological Measurement}, volume = {30}, number = {6}, year = {2006}, pages = {493-508}, publisher = {Sage Publications: US}, abstract = {Two local methods for observed-score equating are applied to the problem of equating an adaptive test to a linear test. In an empirical study, the methods were evaluated against a method based on the test characteristic function (TCF) of the linear test and traditional equipercentile equating applied to the ability estimates on the adaptive test for a population of test takers. The two local methods were generally best. Surprisingly, the TCF method performed slightly worse than the equipercentile method. Both methods showed strong bias and uniformly large inaccuracy, but the TCF method suffered from extra error due to the lower asymptote of the test characteristic function. It is argued that the worse performances of the two methods are a consequence of the fact that they use a single equating transformation for an entire population of test takers and therefore have to compromise between the individual score distributions. }, keywords = {computerized adaptive testing, equipercentile equating, local equating, score reporting, test characteristic function}, isbn = {0146-6216 (Print)}, author = {van der Linden, W. J.} } @article {35, title = {Expansion of a physical function item bank and development of an abbreviated form for clinical research}, journal = {Journal of Applied Measurement}, volume = {7}, number = {1}, year = {2006}, pages = {1-15}, publisher = {Richard M Smith: US}, abstract = {We expanded an existing 33-item physical function (PF) item bank with a sufficient number of items to enable computerized adaptive testing (CAT). Ten items were written to expand the bank and the new item pool was administered to 295 people with cancer. For this analysis of the new pool, seven poorly performing items were identified for further examination. This resulted in a bank with items that define an essentially unidimensional PF construct, cover a wide range of that construct, reliably measure the PF of persons with cancer, and distinguish differences in self-reported functional performance levels. We also developed a 5-item (static) assessment form ("BriefPF") that can be used in clinical research to express scores on the same metric as the overall bank. The BriefPF was compared to the PF-10 from the Medical Outcomes Study SF-36. Both short forms significantly differentiated persons across functional performance levels. 
While the entire bank was more precise across the PF continuum than either short form, there were differences in the area of the continuum in which each short form was more precise: the BriefPF was more precise than the PF-10 at the lower functional levels and the PF-10 was more precise than the BriefPF at the higher levels. Future research on this bank will include the development of a CAT version, the PF-CAT. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {clinical research, computerized adaptive testing, performance levels, physical function item bank, Psychometrics, test reliability, Test Validity}, isbn = {1529-7713 (Print)}, author = {Bode, R. K. and Lai, J-S. and Dineen, K. and Heinemann, A. W. and Shevrin, D. and Von Roenn, J. and Cella, D.} } @article {237, title = {Factor analysis techniques for assessing sufficient unidimensionality of cancer related fatigue}, journal = {Quality of Life Research}, volume = {15}, number = {7}, year = {2006}, note = {0962-9343 (Print)Journal ArticleResearch Support, N.I.H., Extramural}, month = {Sep}, pages = {1179-90}, abstract = {BACKGROUND: Fatigue is the most common unrelieved symptom experienced by people with cancer. The purpose of this study was to examine whether cancer-related fatigue (CRF) can be summarized using a single score, that is, whether CRF is sufficiently unidimensional for measurement approaches that require or assume unidimensionality. We evaluated this question using factor analysis techniques including the theory-driven bi-factor model. METHODS: Five hundred and fifty five cancer patients from the Chicago metropolitan area completed a 72-item fatigue item bank, covering a range of fatigue-related concerns including intensity, frequency and interference with physical, mental, and social activities. Dimensionality was assessed using exploratory and confirmatory factor analysis (CFA) techniques. RESULTS: Exploratory factor analysis (EFA) techniques identified from 1 to 17 factors. The bi-factor model suggested that CRF was sufficiently unidimensional. CONCLUSIONS: CRF can be considered sufficiently unidimensional for applications that require unidimensionality. One such application, item response theory (IRT), will facilitate the development of short-form and computer-adaptive testing. This may further enable practical and accurate clinical assessment of CRF.}, keywords = {*Factor Analysis, Statistical, *Quality of Life, Aged, Chicago, Fatigue/*etiology, Female, Humans, Male, Middle Aged, Neoplasms/*complications, Questionnaires}, author = {Lai, J-S. and Crane, P. K. and Cella, D.} } @article {314, title = {[Item Selection Strategies of Computerized Adaptive Testing based on Graded Response Model.]}, journal = {Acta Psychologica Sinica}, volume = {38}, number = {3}, year = {2006}, pages = {461-467}, publisher = {Science Press: China}, abstract = {Item selection strategy (ISS) is an important component of Computerized Adaptive Testing (CAT). Its performance directly affects the security, efficiency and precision of the test. Thus, ISS becomes one of the central issues in CATs based on the Graded Response Model (GRM). It is well known that the goal of IIS is to administer the next unused item remaining in the item bank that best fits the examinees current ability estimate. In dichotomous IRT models, every item has only one difficulty parameter and the item whose difficulty matches the examinee{\textquoteright}s current ability estimate is considered to be the best fitting item. 
However, in GRM, each item has more than two ordered categories and has no single value to represent the item difficulty. Consequently, some researchers have employed the average or the median difficulty value across categories as the difficulty estimate for the item. Using the average value and the median value in effect introduced two corresponding ISSs. In this study, we used computer simulation to compare four ISSs based on GRM. We also discussed the effect of "shadow pool" on the uniformity of pool usage as well as the influence of different item parameter distributions and different ability estimation methods on the evaluation criteria of CAT. In the simulation process, the Monte Carlo method was adopted to simulate the entire CAT process; 1,000 examinees drawn from a standard normal distribution and four 1,000-sized item pools of different item parameter distributions were also simulated. The assumption of the simulation is that a polytomous item comprises six ordered categories. In addition, ability estimates were derived using two methods: expected a posteriori Bayesian (EAP) and maximum likelihood estimation (MLE). In MLE, the Newton-Raphson iteration method and the Fisher Score iteration method were employed, respectively, to solve the likelihood equation. Moreover, the CAT process was simulated 30 times for each examinee to eliminate random error. The ISSs were evaluated by four indices commonly used in CAT, covering four aspects: the accuracy of ability estimation, the stability of the ISS, the usage of the item pool, and the test efficiency. Simulation results supported the ISS that matched the estimate of an examinee{\textquoteright}s current trait level with the difficulty values across categories. Setting a "shadow pool" in the ISS improved the uniformity of pool utilization. Finally, different item parameter distributions and different ability estimation methods affected the evaluation indices of CAT. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computerized adaptive testing, item selection strategy}, isbn = {0439-755X (Print)}, author = {Ping, Chen and Shuliang, Ding and Haijing, Lin and Jie, Zhou} } @article {16, title = {Maximum information stratification method for controlling item exposure in computerized adaptive testing}, journal = {Psicothema}, volume = {18}, number = {1}, year = {2006}, note = {Barrada, Juan RamonMazuela, PalomaOlea, JulioResearch Support, Non-U.S. Gov{\textquoteright}tSpainPsicothemaPsicothema. 2006 Feb;18(1):156-9.}, month = {Feb}, pages = {156-159}, edition = {2007/02/14}, abstract = {The proposal for increasing the security in Computerized Adaptive Tests that has received most attention in recent years is the a-stratified method (AS - Chang and Ying, 1999): at the beginning of the test only items with low discrimination parameters (a) can be administered, with the values of the a parameters increasing as the test goes on. With this method, the distribution of the exposure rates of the items is less skewed, while efficiency is maintained in trait-level estimation. The pseudo-guessing parameter (c), present in the three-parameter logistic model, is considered irrelevant, and is not used in the AS method. The Maximum Information Stratified (MIS) model incorporates the c parameter in the stratification of the bank and in the item-selection rule, improving accuracy by comparison with the AS, for item banks with a and b parameters correlated and uncorrelated.
For both kinds of banks, the blocking b methods (Chang, Qian and Ying, 2001) improve the security of the item bank.M{\'e}todo de estratificaci{\'o}n por m{\'a}xima informaci{\'o}n para el control de la exposici{\'o}n en tests adaptativos informatizados. La propuesta para aumentar la seguridad en los tests adaptativos informatizados que ha recibido m{\'a}s atenci{\'o}n en los {\'u}ltimos a{\~n}os ha sido el m{\'e}todo a-estratificado (AE - Chang y Ying, 1999): en los momentos iniciales del test s{\'o}lo pueden administrarse {\'\i}tems con bajos par{\'a}metros de discriminaci{\'o}n (a), increment{\'a}ndose los valores del par{\'a}metro a admisibles seg{\'u}n avanza el test. Con este m{\'e}todo la distribuci{\'o}n de las tasas de exposici{\'o}n de los {\'\i}tems es m{\'a}s equilibrada, manteniendo una adecuada precisi{\'o}n en la medida. El par{\'a}metro de pseudoadivinaci{\'o}n (c), presente en el modelo log{\'\i}stico de tres par{\'a}metros, se supone irrelevante y no se incorpora en el AE. El m{\'e}todo de Estratificaci{\'o}n por M{\'a}xima Informaci{\'o}n (EMI) incorpora el par{\'a}metro c a la estratificaci{\'o}n del banco y a la regla de selecci{\'o}n de {\'\i}tems, mejorando la precisi{\'o}n en comparaci{\'o}n con AE, tanto para bancos donde los par{\'a}metros a y b correlacionan como para bancos donde no. Para ambos tipos de bancos, los m{\'e}todos de bloqueo de b (Chang, Qian y Ying, 2001) mejoran la seguridad del banco.}, keywords = {*Artificial Intelligence, *Microcomputers, *Psychological Tests, *Software Design, Algorithms, Chi-Square Distribution, Humans, Likelihood Functions}, isbn = {0214-9915 (Print)}, author = {Barrada, J and Mazuela, P. and Olea, J.} } @article {174, title = {Measurement precision and efficiency of multidimensional computer adaptive testing of physical functioning using the pediatric evaluation of disability inventory}, journal = {Archives of Physical Medicine and Rehabilitation}, volume = {87}, number = {9}, year = {2006}, note = {Haley, Stephen MNi, PengshengLudlow, Larry HFragala-Pinkham, Maria AK02 hd45354-01/hd/nichdResearch Support, N.I.H., ExtramuralResearch Support, Non-U.S. Gov{\textquoteright}tUnited StatesArchives of physical medicine and rehabilitationArch Phys Med Rehabil. 2006 Sep;87(9):1223-9.}, month = {Sep}, pages = {1223-9}, edition = {2006/08/29}, abstract = {OBJECTIVE: To compare the measurement efficiency and precision of a multidimensional computer adaptive testing (M-CAT) application to a unidimensional CAT (U-CAT) comparison using item bank data from 2 of the functional skills scales of the Pediatric Evaluation of Disability Inventory (PEDI). DESIGN: Using existing PEDI mobility and self-care item banks, we compared the stability of item calibrations and model fit between unidimensional and multidimensional Rasch models and compared the efficiency and precision of the U-CAT- and M-CAT-simulated assessments to a random draw of items. SETTING: Pediatric rehabilitation hospital and clinics. PARTICIPANTS: Clinical and normative samples. INTERVENTIONS: Not applicable. MAIN OUTCOME MEASURES: Not applicable. RESULTS: The M-CAT had greater levels of precision and efficiency than the separate mobility and self-care U-CAT versions when using a similar number of items for each PEDI subdomain. Equivalent estimation of mobility and self-care scores can be achieved with a 25\% to 40\% item reduction with the M-CAT compared with the U-CAT. 
CONCLUSIONS: M-CAT applications appear to have both precision and efficiency advantages compared with separate U-CAT assessments when content subdomains have a high correlation. Practitioners may also realize interpretive advantages of reporting test score information for each subdomain when separate clinical inferences are desired.}, keywords = {*Disability Evaluation, *Pediatrics, Adolescent, Child, Child, Preschool, Computers, Disabled Persons/*classification/rehabilitation, Efficiency, Humans, Infant, Outcome Assessment (Health Care), Psychometrics, Self Care}, isbn = {0003-9993 (Print)}, author = {Haley, S. M. and Ni, P. and Ludlow, L. H. and Fragala-Pinkham, M. A.} } @article {181, title = {Optimal and nonoptimal computer-based test designs for making pass-fail decisions}, journal = {Applied Measurement in Education}, volume = {19}, number = {3}, year = {2006}, pages = {221-239}, publisher = {Lawrence Erlbaum: US}, abstract = {Now that many credentialing exams are being routinely administered by computer, new computer-based test designs, along with item response theory models, are being aggressively researched to identify specific designs that can increase the decision consistency and accuracy of pass-fail decisions. The purpose of this study was to investigate the impact of optimal and nonoptimal multistage test (MST) designs, linear parallel-form test designs (LPFT), and computer adaptive test (CAT) designs on the decision consistency and accuracy of pass-fail decisions. Realistic testing situations matching those of one of the large credentialing agencies were simulated to increase the generalizability of the findings. The conclusions were clear: (a) With the LPFTs, matching test information functions (TIFs) to the mean of the proficiency distribution produced slightly better results than matching them to the passing score; (b) all of the test designs worked better than test construction using random selection of items, subject to content constraints only; (c) CAT performed better than the other test designs; and (d) if matching a TIF to the passing score, the MST design produced slightly better results than the LPFT design. If an argument for the MST design is to be made, it can be made on the basis of slight improvements over the LPFT design and better expected item bank utilization, candidate preference, and the potential for improved diagnostic feedback, compared with the feedback that is possible with fixed linear test forms. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {adaptive test, credentialing exams, Decision Making, Educational Measurement, multistage tests, optimal computer-based test designs, test form}, isbn = {0895-7347 (Print); 1532-4818 (Electronic)}, author = {Hambleton, R. K. and Xing, D.} } @article {116, title = {Optimal testing with easy or difficult items in computerized adaptive testing}, journal = {Applied Psychological Measurement}, volume = {30}, number = {5}, year = {2006}, pages = {379-393}, publisher = {Sage Publications: US}, abstract = {Computerized adaptive tests (CATs) are individualized tests that, from a measurement point of view, are optimal for each individual, possibly under some practical conditions. In the present study, it is shown that maximum information item selection in CATs using an item bank that is calibrated with the one- or the two-parameter logistic model results in each individual answering about 50\% of the items correctly.
Two item selection procedures giving easier (or more difficult) tests for students are presented and evaluated. Item selection on probability points of items yields good results only with the one-parameter logistic model and not with the two-parameter logistic model. An alternative selection procedure, based on maximum information at a shifted ability level, gives satisfactory results with both models. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computer adaptive tests, individualized tests, Item Response Theory, item selection, Measurement}, isbn = {0146-6216 (Print)}, author = {Theo Eggen and Verschoor, Angela J.} } @article {319, title = {SIMCAT 1.0: A SAS computer program for simulating computer adaptive testing}, journal = {Applied Psychological Measurement}, volume = {30}, number = {1}, year = {2006}, pages = {60-61}, publisher = {Sage Publications: US}, abstract = {Monte Carlo methodologies are frequently applied to study the sampling distribution of the estimated proficiency level in adaptive testing. These methods eliminate real situational constraints. However, these Monte Carlo methodologies are not currently supported by the available software programs, and when these programs are available, their flexibility is limited. SIMCAT 1.0 is aimed at the simulation of adaptive testing sessions under different adaptive expected a posteriori (EAP) proficiency-level estimation methods (Blais \& Ra{\^\i}che, 2005; Ra{\^\i}che \& Blais, 2005) based on the one-parameter Rasch logistic model. These methods are all adaptive in the a priori proficiency-level estimation, the proficiency-level estimation bias correction, the integration interval, or a combination of these factors. The use of these adaptive EAP estimation methods diminishes considerably the shrinking, and therefore biasing, effect of the estimated a priori proficiency level encountered when this a priori is fixed at a constant value independently of the computed previous value of the proficiency level. SIMCAT 1.0 also computes empirical and estimated skewness and kurtosis coefficients, such as the standard error, of the estimated proficiency-level sampling distribution. In this way, the program allows one to compare empirical and estimated properties of the estimated proficiency-level sampling distribution under different variations of the EAP estimation method: standard error and bias, like the skewness and kurtosis coefficients. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computer adaptive testing, computer program, estimated proficiency level, Monte Carlo methodologies, Rasch logistic model}, isbn = {0146-6216 (Print)}, author = {Ra{\^\i}che, G. and Blais, J-G.} } @article {2073, title = {Simulated computerized adaptive test for patients with lumbar spine impairments was efficient and produced valid measures of function}, journal = {Journal of Clinical Epidemiology}, volume = {59}, year = {2006}, pages = {947{\textendash}956}, abstract = {Objective: To equate physical functioning (PF) items with Back Pain Functional Scale (BPFS) items, develop a computerized adaptive test (CAT) designed to assess lumbar spine functional status (LFS) in people with lumbar spine impairments, and compare discriminant validity of LFS measures (qIRT) generated using all items analyzed with a rating scale Item Response Theory model (RSM) and measures generated using the simulated CAT (qCAT). Methods: We performed a secondary analysis of retrospective intake rehabilitation data. 
Results: Unidimensionality and local independence of 25 BPFS and PF items were supported. Differential item functioning was negligible for levels of symptom acuity, gender, age, and surgical history. The RSM fit the data well. A lumbar spine specific CAT was developed that was 72\% more efficient than using all 25 items to estimate LFS measures. qIRT and qCAT measures did not discriminate patients by symptom acuity, age, or gender, but discriminated patients by surgical history in similar clinically logical ways. qCAT measures were as precise as qIRT measures. Conclusion: A body part specific simulated CAT developed from an LFS item bank was efficient and produced precise measures of LFS without eroding discriminant validity.}, keywords = {Back Pain Functional Scale, computerized adaptive testing, Item Response Theory, Lumbar spine, Rehabilitation, True-score equating}, doi = {10.1016/j.jclinepi.2005.10.017}, author = {Hart, D. L. and Mioduski, J. E. and Werneke, M. W. and Stratford, P. W.} } @article {2074, title = {Simulated computerized adaptive test for patients with shoulder impairments was efficient and produced valid measures of function}, journal = {Journal of Clinical Epidemiology}, volume = {59}, year = {2006}, pages = {290-298}, abstract = {

Background and Objective: To test unidimensionality and local independence of a set of shoulder functional status (SFS) items, develop a computerized adaptive test (CAT) of the items using a rating scale item response theory model (RSM), and compare discriminant validity of measures generated using all items (qIRT) and measures generated using the simulated CAT (qCAT). Study Design and Setting: We performed a secondary analysis of data collected prospectively during rehabilitation of 400 patients with shoulder impairments who completed 60 SFS items. Results: Factor analytic techniques supported that the 42 SFS items formed a unidimensional scale and were locally independent. Except for five items, which were deleted, the RSM fit the data well. The remaining 37 SFS items were used to generate the CAT. On average, 6 items were needed to estimate precise measures of function using the SFS CAT, compared with all 37 SFS items. The qIRT and qCAT measures were highly correlated (r = .96) and resulted in similar classifications of patients. Conclusion: The simulated SFS CAT was efficient and produced precise, clinically relevant measures of functional status with good discriminating ability.

}, keywords = {computerized adaptive testing, Flexilevel Scale of Shoulder Function, Item Response Theory, Rehabilitation}, author = {Hart, D. L. and Cook, K. F. and Mioduski, J. E. and Teal, C. R. and Crane, P. K.} } @article {296, title = {T{\'e}cnicas para detectar patrones de respuesta at{\'\i}picos [Aberrant patterns detection methods]}, journal = {Anales de Psicolog{\'\i}a}, volume = {22}, number = {1}, year = {2006}, note = {Spain: Universidad de Murcia}, pages = {143-154}, abstract = {La identificaci{\'o}n de patrones de respuesta at{\'\i}picos es de gran utilidad para la construcci{\'o}n de tests y de bancos de {\'\i}tems con propiedades psicom{\'e}tricas as{\'\i} como para el an{\'a}lisis de validez de los mismos. En este trabajo de revisi{\'o}n se han recogido los m{\'a}s relevantes y novedosos m{\'e}todos de ajuste de personas que se han elaborado dentro de cada uno de los principales {\'a}mbitos de trabajo de la Psicometr{\'\i}a: el escalograma de Guttman, la Teor{\'\i}a Cl{\'a}sica de Tests (TCT), la Teor{\'\i}a de la Generalizabilidad (TG), la Teor{\'\i}a de Respuesta al {\'I}tem (TRI), los Modelos de Respuesta al {\'I}tem No Param{\'e}tricos (MRINP), los Modelos de Clase Latente de Orden Restringido (MCL-OR) y el An{\'a}lisis de Estructura de Covarianzas (AEC).Aberrant patterns detection has a great usefulness in order to make tests and item banks with psychometric characteristics and validity analysis of tests and items. The most relevant and newest person-fit methods have been reviewed. All of them have been made in each one of main areas of Psychometry: Guttman{\textquoteright}s scalogram, Classical Test Theory (CTT), Generalizability Theory (GT), Item Response Theory (IRT), Non-parametric Response Models (NPRM), Order-Restricted Latent Class Models (OR-LCM) and Covariance Structure Analysis (CSA).}, keywords = {aberrant patterns detection, Classical Test Theory, generalizability theory, Item Response, Item Response Theory, Mathematics, methods, person-fit, Psychometrics, psychometry, Test Validity, test validity analysis, Theory}, isbn = {0212-9728}, author = {N{\'u}{\~n}ez, R. M. N. and Pina, J. A. L.} } @inbook {180, title = {Applications of item response theory to improve health outcomes assessment: Developing item banks, linking instruments, and computer-adaptive testing}, booktitle = {Outcomes assessment in cancer}, year = {2005}, note = {Using Smart Source ParsingOutcomes assessment in cancer: Measures, methods, and applications. (pp. 445-464). New York, NY : Cambridge University Press. xiv, 662 pp}, pages = {445-464}, publisher = {Cambridge University Press}, organization = {Cambridge University Press}, address = {Cambridge, UK}, abstract = {(From the chapter) The current chapter builds on Reise{\textquoteright}s introduction to the basic concepts, assumptions, popular models, and important features of IRT and discusses the applications of item response theory (IRT) modeling to health outcomes assessment. In particular, we highlight the critical role of IRT modeling in: developing an instrument to match a study{\textquoteright}s population; linking two or more instruments measuring similar constructs on a common metric; and creating item banks that provide the foundation for tailored short-form instruments or for computerized adaptive assessments. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Computer Assisted Testing, Health, Item Response Theory, Measurement, Test Construction, Treatment Outcomes}, author = {Hambleton, R. K.}, editor = {C. C. 
Gotay and C. Snyder} } @article {175, title = {Assessing mobility in children using a computer adaptive testing version of the pediatric evaluation of disability inventory}, journal = {Archives of Physical Medicine and Rehabilitation}, volume = {86}, number = {5}, year = {2005}, note = {Haley, Stephen MRaczek, Anastasia ECoster, Wendy JDumas, Helene MFragala-Pinkham, Maria AK02 hd45354-01a1/hd/nichdR43 hd42388-01/hd/nichdResearch Support, N.I.H., ExtramuralResearch Support, U.S. Gov{\textquoteright}t, P.H.S.United StatesArchives of physical medicine and rehabilitationArch Phys Med Rehabil. 2005 May;86(5):932-9.}, month = {May}, pages = {932-9}, edition = {2005/05/17}, abstract = {OBJECTIVE: To assess score agreement, validity, precision, and response burden of a prototype computerized adaptive testing (CAT) version of the Mobility Functional Skills Scale (Mob-CAT) of the Pediatric Evaluation of Disability Inventory (PEDI) as compared with the full 59-item version (Mob-59). DESIGN: Computer simulation analysis of cross-sectional and longitudinal retrospective data; and cross-sectional prospective study. SETTING: Pediatric rehabilitation hospital, including inpatient acute rehabilitation, day school program, outpatient clinics, community-based day care, preschool, and children{\textquoteright}s homes. PARTICIPANTS: Four hundred sixty-nine children with disabilities and 412 children with no disabilities (analytic sample); 41 children without disabilities and 39 with disabilities (cross-validation sample). INTERVENTIONS: Not applicable. MAIN OUTCOME MEASURES: Summary scores from a prototype Mob-CAT application and versions using 15-, 10-, and 5-item stopping rules; scores from the Mob-59; and number of items and time (in seconds) to administer assessments. RESULTS: Mob-CAT scores from both computer simulations (intraclass correlation coefficient [ICC] range, .94-.99) and field administrations (ICC=.98) were in high agreement with scores from the Mob-59. Using computer simulations of retrospective data, discriminant validity, and sensitivity to change of the Mob-CAT closely approximated that of the Mob-59, especially when using the 15- and 10-item stopping rule versions of the Mob-CAT. The Mob-CAT used no more than 15\% of the items for any single administration, and required 20\% of the time needed to administer the Mob-59. CONCLUSIONS: Comparable score estimates for the PEDI mobility scale can be obtained from CAT administrations, with losses in validity and precision for shorter forms, but with a considerable reduction in administration time.}, keywords = {*Computer Simulation, *Disability Evaluation, Adolescent, Child, Child, Preschool, Cross-Sectional Studies, Disabled Children/*rehabilitation, Female, Humans, Infant, Male, Outcome Assessment (Health Care)/*methods, Rehabilitation Centers, Rehabilitation/*standards, Sensitivity and Specificity}, isbn = {0003-9993 (Print)}, author = {Haley, S. M. and Raczek, A. E. and Coster, W. J. and Dumas, H. M. and Fragala-Pinkham, M. A.} } @article {102, title = {A Bayesian student model without hidden nodes and its comparison with item response theory}, journal = {International Journal of Artificial Intelligence in Education}, volume = {15}, number = {4}, year = {2005}, pages = {291-323}, publisher = {IOS Press: Netherlands}, abstract = {The Bayesian framework offers a number of techniques for inferring an individual{\textquoteright}s knowledge state from evidence of mastery of concepts or skills. 
A typical application where such a technique can be useful is Computer Adaptive Testing (CAT). A Bayesian modeling scheme, POKS, is proposed and compared to the traditional Item Response Theory (IRT), which has been the prevalent CAT approach for the last three decades. POKS is based on the theory of knowledge spaces and constructs item-to-item graph structures without hidden nodes. It aims to offer an effective knowledge assessment method with an efficient algorithm for learning the graph structure from data. We review the different Bayesian approaches to modeling student ability assessment and discuss how POKS relates to them. The performance of POKS is compared to the IRT two parameter logistic model. Experimental results over a 34 item Unix test and a 160 item French language test show that both approaches can classify examinees as master or non-master effectively and efficiently, with relatively comparable performance. However, more significant differences are found in favor of POKS for a second task that consists in predicting individual question item outcome. Implications of these results for adaptive testing and student modeling are discussed, as well as the limitations and advantages of POKS, namely the issue of integrating concepts into its structure. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {Bayesian Student Model, computer adaptive testing, hidden nodes, Item Response Theory}, isbn = {1560-4292 (Print); 1560-4306 (Electronic)}, author = {Desmarais, M. C. and Pu, X.} } @article {398, title = {A comparison of item-selection methods for adaptive tests with content constraints}, journal = {Journal of Educational Measurement}, volume = {42}, number = {3}, year = {2005}, pages = {283-302}, publisher = {Blackwell Publishing: United Kingdom}, abstract = {In test assembly, a fundamental difference exists between algorithms that select a test sequentially or simultaneously. Sequential assembly allows us to optimize an objective function at the examinee{\textquoteright}s ability estimate, such as the test information function in computerized adaptive testing. But it leads to the non-trivial problem of how to realize a set of content constraints on the test{\textemdash}a problem more naturally solved by a simultaneous item-selection method. Three main item-selection methods in adaptive testing offer solutions to this dilemma. The spiraling method moves item selection across categories of items in the pool proportionally to the numbers needed from them. Item selection by the weighted-deviations method (WDM) and the shadow test approach (STA) is based on projections of the future consequences of selecting an item. These two methods differ in that the former calculates a projection of a weighted sum of the attributes of the eventual test and the latter a projection of the test itself. The pros and cons of these methods are analyzed. An empirical comparison between the WDM and STA was conducted for an adaptive version of the Law School Admission Test (LSAT), which showed equally good item-exposure rates but violations of some of the constraints and larger bias and inaccuracy of the ability estimator for the WDM.}, keywords = {Adaptive Testing, Algorithms, content constraints, item selection method, shadow test approach, spiraling method, weighted deviations method}, isbn = {0022-0655 (Print)}, author = {van der Linden, W. 
J.} } @article {150, title = {Computer adaptive testing}, journal = {Journal of Applied Measurement}, volume = {6}, number = {1}, year = {2005}, note = {Gershon, Richard CReviewUnited StatesJournal of applied measurementJ Appl Meas. 2005;6(1):109-27.}, pages = {109-27}, edition = {2005/02/11}, abstract = {The creation of item response theory (IRT) and Rasch models, inexpensive accessibility to high speed desktop computers, and the growth of the Internet have led to the creation and growth of computerized adaptive testing or CAT. This form of assessment is applicable both to high stakes tests, such as certification or licensure exams, and to health related quality of life surveys. This article discusses the historical background of CAT, including its many advantages over conventional (typically paper and pencil) alternatives. The process of CAT is then described, including descriptions of the specific differences of using CAT based upon 1-, 2- and 3-parameter IRT and various Rasch models. Numerous specific topics describing CAT in practice are described, including: initial item selection, content balancing, test difficulty, test length and stopping rules. The article concludes with the author{\textquoteright}s reflections regarding the future of CAT.}, keywords = {*Internet, *Models, Statistical, *User-Computer Interface, Certification, Health Surveys, Humans, Licensure, Microcomputers, Quality of Life}, isbn = {1529-7713 (Print)}, author = {Gershon, R. C.} } @article {171, title = {A computer adaptive testing approach for assessing physical functioning in children and adolescents}, journal = {Developmental Medicine and Child Neurology}, volume = {47}, number = {2}, year = {2005}, note = {Haley, Stephen MNi, PengshengFragala-Pinkham, Maria ASkrinar, Alison MCorzo, DeyaniraComparative StudyResearch Support, Non-U.S. Gov{\textquoteright}tEnglandDevelopmental medicine and child neurologyDev Med Child Neurol. 2005 Feb;47(2):113-20.}, month = {Feb}, pages = {113-120}, edition = {2005/02/15}, abstract = {The purpose of this article is to demonstrate: (1) the accuracy and (2) the reduction in amount of time and effort in assessing physical functioning (self-care and mobility domains) of children and adolescents using computer-adaptive testing (CAT). A CAT algorithm selects questions directly tailored to the child{\textquoteright}s ability level, based on previous responses. Using a CAT algorithm, a simulation study was used to determine the number of items necessary to approximate the score of a full-length assessment. We built simulated CATs (5-, 10-, 15-, and 20-item versions) for self-care and mobility domains and tested their accuracy in a normative sample (n=373; 190 males, 183 females; mean age 6y 11mo [SD 4y 2m], range 4mo to 14y 11mo) and a sample of children and adolescents with Pompe disease (n=26; 21 males, 5 females; mean age 6y 1mo [SD 3y 10mo], range 5mo to 14y 10mo). Results indicated that comparable score estimates (based on computer simulations) to the full-length tests can be achieved in a 20-item CAT version for all age ranges and for normative and clinical samples. No more than 13 to 16\% of the items in the full-length tests were needed for any one administration.
These results support further consideration of using CAT programs for accurate and efficient clinical assessments of physical functioning.}, keywords = {*Computer Systems, Activities of Daily Living, Adolescent, Age Factors, Child, Child Development/*physiology, Child, Preschool, Computer Simulation, Confidence Intervals, Demography, Female, Glycogen Storage Disease Type II/physiopathology, Health Status Indicators, Humans, Infant, Infant, Newborn, Male, Motor Activity/*physiology, Outcome Assessment (Health Care)/*methods, Reproducibility of Results, Self Care, Sensitivity and Specificity}, isbn = {0012-1622 (Print)}, author = {Haley, S. M. and Ni, P. and Fragala-Pinkham, M. A. and Skrinar, A. M. and Corzo, D.} } @article {192, title = {A computer-assisted test design and diagnosis system for use by classroom teachers}, journal = {Journal of Computer Assisted Learning}, volume = {21}, number = {6}, year = {2005}, pages = {419-429}, abstract = {Computer-assisted assessment (CAA) has become increasingly important in education in recent years. A variety of computer software systems have been developed to help assess the performance of students at various levels. However, such systems are primarily designed to provide objective assessment of students and analysis of test items, and focus has been mainly placed on higher and further education. Although there are commercial professional systems available for use by primary and secondary educational institutions, such systems are generally expensive and require skilled expertise to operate. In view of the rapid progress made in the use of computer-based assessment for primary and secondary students by education authorities here in the UK and elsewhere, there is a need to develop systems which are economic and easy to use and can provide the necessary information that can help teachers improve students{\textquoteright} performance. This paper presents the development of a software system that provides a range of functions including generating items and building item banks, designing tests, conducting tests on computers and analysing test results. Specifically, the system can generate information on the performance of students and test items that can be easily used to identify curriculum areas where students are under performing. A case study based on data collected from five secondary schools in Hong Kong involved in the Curriculum, Evaluation and Management Centre{\textquoteright}s Middle Years Information System Project, Durham University, UK, has been undertaken to demonstrate the use of the system for diagnostic and performance analysis. (PsycINFO Database Record (c) 2006 APA ) (journal abstract)}, keywords = {Computer Assisted Testing, Computer Software, Diagnosis, Educational Measurement, Teachers}, author = {He, Q. and Tymms, P.} } @article {72, title = {Controlling item exposure and test overlap in computerized adaptive testing}, journal = {Applied Psychological Measurement}, volume = {29}, number = {3}, year = {2005}, pages = {204-217}, abstract = {This article proposes an item exposure control method, which is the extension of the Sympson and Hetter procedure and can provide item exposure control at both the item and test levels. Item exposure rate and test overlap rate are two indices commonly used to track item exposure in computerized adaptive tests. By considering both indices, item exposure can be monitored at both the item and test levels. 
To control the item exposure rate and test overlap rate simultaneously, the modified procedure attempted to control not only the maximum value but also the variance of item exposure rates. Results indicated that the item exposure rate and test overlap rate could be controlled simultaneously by implementing the modified procedure. Item exposure control was improved and precision of trait estimation decreased when a prespecified maximum test overlap rate was stringent. (PsycINFO Database Record (c) 2005 APA ) (journal abstract)}, keywords = {Adaptive Testing, Computer Assisted Testing, Item Content (Test) computerized adaptive testing}, author = {Chen, S-Y. and Lei, P-W.} } @article {85, title = {Dynamic assessment of health outcomes: Time to let the CAT out of the bag?}, journal = {Health Services Research}, volume = {40}, number = {5, part2}, year = {2005}, pages = {1694-1711}, publisher = {Blackwell Publishing: United Kingdom}, abstract = {Background: The use of item response theory (IRT) to measure self-reported outcomes has burgeoned in recent years. Perhaps the most important application of IRT is computer-adaptive testing (CAT), a measurement approach in which the selection of items is tailored for each respondent. Objective. To provide an introduction to the use of CAT in the measurement of health outcomes, describe several IRT models that can be used as the basis of CAT, and discuss practical issues associated with the use of adaptive scaling in research settings. Principal Points: The development of a CAT requires several steps that are not required in the development of a traditional measure including identification of "starting" and "stopping" rules. CAT{\textquoteright}s most attractive advantage is its efficiency. Greater measurement precision can be achieved with fewer items. Disadvantages of CAT include the high cost and level of technical expertise required to develop a CAT. Conclusions: Researchers, clinicians, and patients benefit from the availability of psychometrically rigorous measures that are not burdensome. CAT outcome measures hold substantial promise in this regard, but their development is not without challenges. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computer adaptive testing, Item Response Theory, self reported health outcomes}, isbn = {0017-9124 (Print); 1475-6773 (Electronic)}, author = {Cook, K. F. and O{\textquoteright}Malley, K. J. and Roddey, T. S.} } @article {253, title = {Increasing the homogeneity of CAT{\textquoteright}s item-exposure rates by minimizing or maximizing varied target functions while assembling shadow tests}, journal = {Journal of Educational Measurement}, volume = {42}, number = {3}, year = {2005}, pages = {245-269}, publisher = {Blackwell Publishing: United Kingdom}, abstract = {A computerized adaptive testing (CAT) algorithm that has the potential to increase the homogeneity of CATs item-exposure rates without significantly sacrificing the precision of ability estimates was proposed and assessed in the shadow-test (van der Linden \& Reese, 1998) CAT context. This CAT algorithm was formed by a combination of maximizing or minimizing varied target functions while assembling shadow tests. There were four target functions to be separately used in the first, second, third, and fourth quarter test of CAT. 
The elements to be used in the four functions were associated with (a) a random number assigned to each item, (b) the absolute difference between an examinee{\textquoteright}s current ability estimate and an item difficulty, (c) the absolute difference between an examinee{\textquoteright}s current ability estimate and an optimum item difficulty, and (d) item information. The results indicated that this combined CAT fully utilized all the items in the pool, reduced the maximum exposure rates, and achieved more homogeneous exposure rates. Moreover, its precision in recovering ability estimates was similar to that of the maximum item-information method. The combined CAT method resulted in the best overall results compared with the other individual CAT item-selection methods. The findings from the combined CAT are encouraging. Future uses are discussed. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {algorithm, computerized adaptive testing, item exposure rate, shadow test, varied target function}, isbn = {0022-0655 (Print)}, author = {Li, Y. H. and Schafer, W. D.} } @article {240, title = {An item response theory-based pain item bank can enhance measurement precision}, journal = {Journal of Pain and Symptom Management}, volume = {30}, number = {3}, year = {2005}, note = {0885-3924Journal Article}, pages = {278-88}, abstract = {Cancer-related pain is often under-recognized and undertreated. This is partly due to the lack of appropriate assessments, which need to be comprehensive and precise yet easily integrated into clinics. Computerized adaptive testing (CAT) can enable precise-yet-brief assessments by only selecting the most informative items from a calibrated item bank. The purpose of this study was to create such a bank. The sample included 400 cancer patients who were asked to complete 61 pain-related items. Data were analyzed using factor analysis and the Rasch model. The final bank consisted of 43 items which satisfied the measurement requirement of factor analysis and the Rasch model, demonstrated high internal consistency and reasonable item-total correlations, and discriminated patients with differing degrees of pain. We conclude that this bank demonstrates good psychometric properties, is sensitive to pain reported by patients, and can be used as the foundation for a CAT pain-testing platform for use in clinical practice.}, keywords = {computerized adaptive testing}, author = {Lai, J-S. and Dineen, K. and Reeve, B. B. and Von Roenn, J. and Shervin, D. and McGuire, M. and Bode, R. K. and Paice, J. and Cella, D.} } @article {357, title = {Measuring physical function in patients with complex medical and postsurgical conditions: a computer adaptive approach}, journal = {American Journal of Physical Medicine and Rehabilitation}, volume = {84}, number = {10}, year = {2005}, note = {0894-9115 (Print)Comparative StudyJournal ArticleResearch Support, N.I.H., ExtramuralResearch Support, U.S. Gov{\textquoteright}t, P.H.S.}, month = {Oct}, pages = {741-8}, abstract = {OBJECTIVE: To examine whether the range of disability in the medically complex and postsurgical populations receiving rehabilitation is adequately sampled by the new Activity Measure--Post-Acute Care (AM-PAC), and to assess whether computer adaptive testing (CAT) can derive valid patient scores using fewer questions. 
DESIGN: Observational study of 158 subjects (mean age 67.2 yrs) receiving skilled rehabilitation services in inpatient (acute rehabilitation hospitals, skilled nursing facility units) and community (home health services, outpatient departments) settings for recent-onset or worsening disability from medical (excluding neurological) and surgical (excluding orthopedic) conditions. Measures were interviewer-administered activity questions (all patients) and physical functioning portion of the SF-36 (outpatients) and standardized chart items (11 Functional Independence Measure (FIM), 19 Standardized Outcome and Assessment Information Set (OASIS) items, and 22 Minimum Data Set (MDS) items). Rasch modeling analyzed all data and the relationship between person ability estimates and average item difficulty. CAT assessed the ability to derive accurate patient scores using a sample of questions. RESULTS: The 163-item activity item pool covered the range of physical movement and personal and instrumental activities. CAT analysis showed comparable scores between estimates using 10 items or the total item pool. CONCLUSION: The AM-PAC can assess a broad range of function in patients with complex medical illness. CAT achieves valid patient scores using fewer questions.}, keywords = {Activities of Daily Living/*classification, Adult, Aged, Cohort Studies, Continuity of Patient Care, Disability Evaluation, Female, Health Services Research, Humans, Male, Middle Aged, Postoperative Care/*rehabilitation, Prognosis, Recovery of Function, Rehabilitation Centers, Rehabilitation/*standards, Sensitivity and Specificity, Sickness Impact Profile, Treatment Outcome}, author = {Siebens, H. and Andres, P. L. and Pengsheng, N. and Coster, W. J. and Haley, S. M.} } @article {142, title = {The promise of PROMIS: using item response theory to improve assessment of patient-reported outcomes}, journal = {Clinical and Experimental Rheumatology}, volume = {23}, number = {5 Suppl 39}, year = {2005}, pages = {S53-7}, abstract = {PROMIS (Patient-Reported-Outcomes Measurement Information System) is an NIH Roadmap network project intended to improve the reliability, validity, and precision of PROs and to provide definitive new instruments that will exceed the capabilities of classic instruments and enable improved outcome measurement for clinical research across all NIH institutes. Item response theory (IRT) measurement models now permit us to transition conventional health status assessment into an era of item banking and computerized adaptive testing (CAT). Item banking uses IRT measurement models and methods to develop item banks from large pools of items from many available questionnaires. IRT allows the reduction and improvement of items and assembles domains of items which are unidimensional and not excessively redundant. CAT provides a model-driven algorithm and software to iteratively select the most informative remaining item in a domain until a desired degree of precision is obtained. Through these approaches the number of patients required for a clinical trial may be reduced while holding statistical power constant. PROMIS tools, expected to improve precision and enable assessment at the individual patient level which should broaden the appeal of PROs, will begin to be available to the general medical community in 2008.}, keywords = {computerized adaptive testing}, author = {Fries, J.F. and Bruce, B. 
and Cella, D.} } @article {4, title = {Propiedades psicom{\'e}tricas de un test Adaptativo Informatizado para la medici{\'o}n del ajuste emocional [Psychometric properties of an Emotional Adjustment Computerized Adaptive Test]}, journal = {Psicothema}, volume = {17}, number = {3}, year = {2005}, pages = {484-491}, abstract = {En el presente trabajo se describen las propiedades psicom{\'e}tricas de un Test Adaptativo Informatizado para la medici{\'o}n del ajuste emocional de las personas. La revisi{\'o}n de la literatura acerca de la aplicaci{\'o}n de los modelos de la teor{\'\i}a de la respuesta a los {\'\i}tems (TRI) muestra que {\'e}sta se ha utilizado m{\'a}s en el trabajo con variables aptitudinales que para la medici{\'o}n de variables de personalidad, sin embargo diversos estudios han mostrado la eficacia de la TRI para la descripci{\'o}n psicom{\'e}trica de dichasvariables. Aun as{\'\i}, pocos trabajos han explorado las caracter{\'\i}sticas de un Test Adaptativo Informatizado, basado en la TRI, para la medici{\'o}n de una variable de personalidad como es el ajuste emocional. Nuestros resultados muestran la eficiencia del TAI para la evaluaci{\'o}n del ajuste emocional, proporcionando una medici{\'o}n v{\'a}lida y precisa, utilizando menor n{\'u}mero de elementos de medida encomparaci{\'o}n con las escalas de ajuste emocional de instrumentos fuertemente implantados. Psychometric properties of an emotional adjustment computerized adaptive test. In the present work it was described the psychometric properties of an emotional adjustment computerized adaptive test. An examination of Item Response Theory (IRT) research literature indicates that IRT has been mainly used for assessing achievements and ability rather than personality factors. Nevertheless last years have shown several studies wich have successfully used IRT to personality assessment instruments. Even so, a few amount of works has inquired the computerized adaptative test features, based on IRT, for the measurement of a personality traits as it{\textquoteright}s the emotional adjustment. Our results show the CAT efficiency for the emotional adjustment assessment so this provides a valid and accurate measurement; by using a less number of items in comparison with the emotional adjustment scales from the most strongly established questionnaires.}, keywords = {Computer Assisted Testing, Emotional Adjustment, Item Response, Personality Measures, Psychometrics, Test Validity, Theory}, author = {Aguado, D. and Rubio, V. J. and Hontangas, P. M. and Hern{\'a}ndez, J. M.} } @article {198, title = {A randomized experiment to compare conventional, computerized, and computerized adaptive administration of ordinal polytomous attitude items}, journal = {Applied Psychological Measurement}, volume = {29}, number = {3}, year = {2005}, pages = {159-183}, abstract = {A total of 520 high school students were randomly assigned to a paper-and-pencil test (PPT), a computerized standard test (CST), or a computerized adaptive test (CAT) version of the Dutch School Attitude Questionnaire (SAQ), consisting of ordinal polytomous items. The CST administered items in the same order as the PPT. The CAT administered all items of three SAQ subscales in adaptive order using Samejima{\textquoteright}s graded response model, so that six different stopping rule settings could be applied afterwards. School marks were used as external criteria. 
Results showed significant but small multivariate administration mode effects on conventional raw scores and small to medium effects on maximum likelihood latent trait estimates. When the precision of CAT latent trait estimates decreased, correlations with grade point average in general decreased. However, the magnitude of the decrease was not very large as compared to the PPT, the CST, and the CAT without the stopping rule. (PsycINFO Database Record (c) 2005 APA ) (journal abstract)}, keywords = {Computer Assisted Testing, Test Administration, Test Items}, author = {Hol, A. M. and Vorst, H. C. M. and Mellenbergh, G. J.} } @article {304, title = {Recent trends in comparability studies}, number = {05-05}, year = {2005}, month = {August, 2005}, institution = {Pearson}, keywords = {computer adaptive testing, Computerized assessment, differential item functioning, Mode effects}, isbn = {05-05}, author = {Paek, P.} } @article {282, title = {Somministrazione di test computerizzati di tipo adattivo: Un{\textquoteright} applicazione del modello di misurazione di Rasch [Administration of computerized and adaptive tests: An application of the Rasch Model]}, journal = {Testing Psicometria Metodologia}, volume = {12}, number = {3}, year = {2005}, pages = {131-149}, abstract = {The aim of the present study is to describe the characteristics of a procedure for administering computerized and adaptive tests (Computer Adaptive Testing or CAT). Items to be asked to the individuals are interactively chosen and are selected from a "bank" in which they were previously calibrated and recorded on the basis of their difficulty level. The selection of items is performed by increasingly more accurate estimates of the examinees{\textquoteright} ability. The building of an item-bank on Psychometrics and the implementation of this procedure allow a first validation through Monte Carlo simulations. (PsycINFO Database Record (c) 2006 APA ) (journal abstract)}, keywords = {Adaptive Testing, Computer Assisted Testing, Item Response Theory computerized adaptive testing, Models, Psychometrics}, author = {Miceli, R. and Molinengo, G.} } @article {195, title = {Test construction for cognitive diagnosis}, journal = {Applied Psychological Measurement}, volume = {29}, number = {4}, year = {2005}, pages = {262-277}, abstract = {Although cognitive diagnostic models (CDMs) can be useful in the analysis and interpretation of existing tests, little has been developed to specify how one might construct a good test using aspects of the CDMs. This article discusses the derivation of a general CDM index based on Kullback-Leibler information that will serve as a measure of how informative an item is for the classification of examinees. The effectiveness of the index is examined for items calibrated using the deterministic input noisy "and" gate model (DINA) and the reparameterized unified model (RUM) by implementing a simple heuristic to construct a test from an item bank. When compared to randomly constructed tests from the same item bank, the heuristic shows significant improvement in classification rates. (PsycINFO Database Record (c) 2005 APA ) (journal abstract)}, keywords = {(Measurement), Cognitive Assessment, Item Analysis (Statistical), Profiles, Test Construction, Test Interpretation, Test Items}, author = {Henson, R. K. 
and Douglas, J.} } @article {168, title = {Activity outcome measurement for postacute care}, journal = {Medical Care}, volume = {42}, number = {1 Suppl}, year = {2004}, note = {0025-7079Journal ArticleMulticenter Study}, pages = {I49-161}, abstract = {BACKGROUND: Efforts to evaluate the effectiveness of a broad range of postacute care services have been hindered by the lack of conceptually sound and comprehensive measures of outcomes. It is critical to determine a common underlying structure before employing current methods of item equating across outcome instruments for future item banking and computer-adaptive testing applications. OBJECTIVE: To investigate the factor structure, reliability, and scale properties of items underlying the Activity domains of the International Classification of Functioning, Disability and Health (ICF) for use in postacute care outcome measurement. METHODS: We developed a 41-item Activity Measure for Postacute Care (AM-PAC) that assessed an individual{\textquoteright}s execution of discrete daily tasks in his or her own environment across major content domains as defined by the ICF. We evaluated the reliability and discriminant validity of the prototype AM-PAC in 477 individuals in active rehabilitation programs across 4 rehabilitation settings using factor analyses, tests of item scaling, internal consistency reliability analyses, Rasch item response theory modeling, residual component analysis, and modified parallel analysis. RESULTS: Results from an initial exploratory factor analysis produced 3 distinct, interpretable factors that accounted for 72\% of the variance: Applied Cognition (44\%), Personal Care \& Instrumental Activities (19\%), and Physical \& Movement Activities (9\%); these 3 activity factors were verified by a confirmatory factor analysis. Scaling assumptions were met for each factor in the total sample and across diagnostic groups. Internal consistency reliability was high for the total sample (Cronbach alpha = 0.92 to 0.94), and for specific diagnostic groups (Cronbach alpha = 0.90 to 0.95). Rasch scaling, residual factor, differential item functioning, and modified parallel analyses supported the unidimensionality and goodness of fit of each unique activity domain. CONCLUSIONS: This 3-factor model of the AM-PAC can form the conceptual basis for common-item equating and computer-adaptive applications, leading to a comprehensive system of outcome instruments for postacute care settings.}, keywords = {*Self Efficacy, *Sickness Impact Profile, Activities of Daily Living/*classification/psychology, Adult, Aftercare/*standards/statistics \& numerical data, Aged, Boston, Cognition/physiology, Disability Evaluation, Factor Analysis, Statistical, Female, Human, Male, Middle Aged, Movement/physiology, Outcome Assessment (Health Care)/*methods/statistics \& numerical data, Psychometrics, Questionnaires/standards, Rehabilitation/*standards/statistics \& numerical data, Reproducibility of Results, Sensitivity and Specificity, Support, U.S. Gov{\textquoteright}t, Non-P.H.S., Support, U.S. Gov{\textquoteright}t, P.H.S.}, author = {Haley, S. M. and Coster, W. J. and Andres, P. L. and Ludlow, L. H. and Ni, P. and Bond, T. L. and Sinclair, S. J. and Jette, A. M.} } @inbook {322, title = {Adaptive computerized educational systems: A case study}, booktitle = {Evidence-based educational methods}, series = {Educational Psychology Series}, year = {2004}, note = {Using Smart Source ParsingEvidence-based educational methods. 
A volume in the educational psychology series. (pp. 143-170). San Diego, CA: Elsevier Academic Press, [URL:http://www.academicpress.com]. xxiv, 382 pp}, pages = {143-169}, publisher = {Elsevier Academic Press}, organization = {Elsevier Academic Press}, chapter = {10}, address = {San Diego, CA. USA}, abstract = {(Created by APA) Adaptive instruction describes adjustments typical of one-on-one tutoring as discussed in the college tutorial scenario. So computerized adaptive instruction refers to the use of computer software--almost always incorporating artificially intelligent services--which has been designed to adjust both the presentation of information and the form of questioning to meet the current needs of an individual learner. This chapter describes a system for Internet-delivered adaptive instruction. The author attempts to demonstrate a sharp difference between the teaching that takes place outside of the classroom in universities and the kind that is at least afforded, if not taken advantage of by many, students in a more personalized educational setting such as those in the small liberal arts colleges. The author describes a computer-based technology that allows that gap to be bridged with the advantage of at least having more highly prepared learners sitting in college classrooms. A limited range of emerging research that supports that proposition is cited. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Artificial, Computer Assisted Instruction, Computer Software, Higher Education, Individualized, Instruction, Intelligence, Internet, Undergraduate Education}, author = {Ray, R. D.}, editor = {R. W. Malott} } @article {202, title = {Assisted self-adapted testing: A comparative study}, journal = {European Journal of Psychological Assessment}, volume = {20}, number = {1}, year = {2004}, pages = {2-9}, abstract = {A new type of self-adapted test (S-AT), called Assisted Self-Adapted Test (AS-AT), is presented. It differs from an ordinary S-AT in that prior to selecting the difficulty category, the computer advises examinees on their best difficulty category choice, based on their previous performance. Three tests (computerized adaptive test, AS-AT, and S-AT) were compared regarding both their psychometric (precision and efficiency) and psychological (anxiety) characteristics. Tests were applied in an actual assessment situation, in which test scores determined 20\% of term grades. A sample of 173 high school students participated. Neither differences in posttest anxiety nor ability were obtained. Concerning precision, AS-AT was as precise as CAT, and both revealed more precision than S-AT. It was concluded that AS-AT acted as a CAT concerning precision. Some hints, but not conclusive support, of the psychological similarity between AS-AT and S-AT were also found. (PsycINFO Database Record (c) 2005 APA ) (journal abstract)}, keywords = {Adaptive Testing, Anxiety, Computer Assisted Testing, Psychometrics, Test}, author = {Hontangas, P. and Olea, J. and Ponsoda, V. and Revuelta, J. and Wise, S. L.} } @article {8, title = {Computer adaptive testing: a strategy for monitoring stroke rehabilitation across settings}, journal = {Topics in Stroke Rehabilitation}, volume = {11}, number = {2}, year = {2004}, note = {Andres, Patricia L.; Black-Schaffer, Randie M.; Ni, Pengsheng; Haley, Stephen M. R01 hd43568/hd/nichd. Evaluation Studies. Research Support, U.S. Gov{\textquoteright}t, Non-P.H.S. Research Support, U.S. Gov{\textquoteright}t, P.H.S. United States. Topics in Stroke Rehabilitation. Top Stroke Rehabil.
2004 Spring;11(2):33-9.}, month = {Spring}, pages = {33-39}, edition = {2004/05/01}, abstract = {Current functional assessment instruments in stroke rehabilitation are often setting-specific and lack precision, breadth, and/or feasibility. Computer adaptive testing (CAT) offers a promising potential solution by providing a quick, yet precise, measure of function that can be used across a broad range of patient abilities and in multiple settings. CAT technology yields a precise score by selecting very few relevant items from a large and diverse item pool based on each individual{\textquoteright}s responses. We demonstrate the potential usefulness of a CAT assessment model with a cross-sectional sample of persons with stroke from multiple rehabilitation settings.}, keywords = {*Computer Simulation, *User-Computer Interface, Adult, Aged, Aged, 80 and over, Cerebrovascular Accident/*rehabilitation, Disabled Persons/*classification, Female, Humans, Male, Middle Aged, Monitoring, Physiologic/methods, Severity of Illness Index, Task Performance and Analysis}, isbn = {1074-9357 (Print)}, author = {Andres, P. L. and Black-Schaffer, R. M. and Ni, P. and Haley, S. M.} } @article {147, title = {Computerized adaptive measurement of depression: A simulation study}, journal = {BMC Psychiatry}, volume = {4}, number = {1}, year = {2004}, pages = {13-23}, abstract = {Background: Efficient, accurate instruments for measuring depression are increasingly important in clinical practice. We developed a computerized adaptive version of the Beck Depression Inventory (BDI). We examined its efficiency and its usefulness in identifying Major Depressive Episodes (MDE) and in measuring depression severity. Methods: Subjects were 744 participants in research studies in which each subject completed both the BDI and the SCID. In addition, 285 patients completed the Hamilton Depression Rating Scale. Results: The adaptive BDI had an AUC as an indicator of a SCID diagnosis of MDE of 88\%, equivalent to the full BDI. The adaptive BDI asked fewer questions than the full BDI (5.6 versus 21 items). The adaptive latent depression score correlated r = .92 with the BDI total score, and the latent depression score correlated more highly with the Hamilton (r = .74) than the BDI total score did (r = .70). Conclusions: Adaptive testing for depression may provide greatly increased efficiency without loss of accuracy in identifying MDE or in measuring depression severity.}, keywords = {*Computer Simulation, Adult, Algorithms, Area Under Curve, Comparative Study, Depressive Disorder/*diagnosis/epidemiology/psychology, Diagnosis, Computer-Assisted/*methods/statistics \& numerical data, Factor Analysis, Statistical, Female, Humans, Internet, Male, Mass Screening/methods, Patient Selection, Personality Inventory/*statistics \& numerical data, Pilot Projects, Prevalence, Psychiatric Status Rating Scales/*statistics \& numerical data, Psychometrics, Research Support, Non-U.S. Gov{\textquoteright}t, Research Support, U.S. Gov{\textquoteright}t, P.H.S., Severity of Illness Index, Software}, author = {Gardner, W. and Shear, K. and Kelleher, K. J. and Pajer, K. A. and Mammen, O. and Buysse, D. and Frank, E.} } @article {11, title = {Computerized adaptive testing with multiple-form structures}, journal = {Applied Psychological Measurement}, volume = {28}, number = {3}, year = {2004}, pages = {147-164}, publisher = {Sage Publications: US}, abstract = {A multiple-form structure (MFS) is an ordered collection or network of testlets (i.e., sets of items).
An examinee{\textquoteright}s progression through the network of testlets is dictated by the correctness of an examinee{\textquoteright}s answers, thereby adapting the test to his or her trait level. The collection of paths through the network yields the set of all possible test forms, allowing test specialists the opportunity to review them before they are administered. Also, limiting the exposure of an individual MFS to a specific period of time can enhance test security. This article provides an overview of methods that have been developed to generate parallel MFSs. The approach is applied to the assembly of an experimental computerized Law School Admission Test (LSAT). (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computerized adaptive testing, Law School Admission Test, multiple-form structure, testlets}, isbn = {0146-6216 (Print)}, author = {Armstrong, R. D. and Jones, D. H. and Koppel, N. B. and Pashley, P. J.} } @article {44, title = {Computers in clinical assessment: Historical developments, present status, and future challenges}, journal = {Journal of Clinical Psychology}, volume = {60}, number = {3}, year = {2004}, pages = {331-345}, publisher = {John Wiley \& Sons: US}, abstract = {Computerized testing methods have long been regarded as a potentially powerful asset for providing psychological assessment services. Ever since computers were first introduced and adapted to the field of assessment psychology in the 1950s, they have been a valuable aid for scoring, data processing, and even interpretation of test results. The history and status of computer-based personality and neuropsychological tests are discussed in this article. Several pertinent issues involved in providing test interpretation by computer are highlighted. Advances in computer-based test use, such as computerized adaptive testing, are described and problems noted. Today, there is great interest in expanding the availability of psychological assessment applications on the Internet. Although these applications show great promise, there are a number of problems associated with providing psychological tests on the Internet that need to be addressed by psychologists before the Internet can become a major medium for psychological service delivery. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {clinical assessment, computerized testing method, Internet, psychological assessment services}, isbn = {0021-9762 (Print); 1097-4679 (Electronic)}, author = {Butcher, J. N. and Perry, J. L. and Hahn, J. A.} } @article {408, title = {Constraining item exposure in computerized adaptive testing with shadow tests}, journal = {Journal of Educational and Behavioral Statistics}, volume = {29}, number = {3}, year = {2004}, pages = {273-291}, publisher = {American Educational Research Assn: US}, abstract = {Item-exposure control in computerized adaptive testing is implemented by imposing item-ineligibility constraints on the assembly process of the shadow tests. The method resembles Sympson and Hetter{\textquoteright}s (1985) method of item-exposure control in that the decisions to impose the constraints are probabilistic. The method does not, however, require time-consuming simulation studies to set values for control parameters before the operational use of the test. Instead, it can set the probabilities of item ineligibility adaptively during the test using the actual item-exposure rates. 
An empirical study using an item pool from the Law School Admission Test showed that application of the method yielded perfect control of the item-exposure rates and had negligible impact on the bias and mean-squared error functions of the ability estimator. }, keywords = {computerized adaptive testing, item exposure control, item ineligibility constraints, Probability, shadow tests}, isbn = {1076-9986 (Print)}, author = {van der Linden, W. J. and Veldkamp, B. P.} } @article {10, title = {Constructing rotating item pools for constrained adaptive testing}, journal = {Journal of Educational Measurement}, volume = {41}, number = {4}, year = {2004}, pages = {345-359}, publisher = {Blackwell Publishing: United Kingdom}, abstract = {Preventing items in adaptive testing from being over- or underexposed is one of the main problems in computerized adaptive testing. Though the problem of overexposed items can be solved using a probabilistic item-exposure control method, such methods are unable to deal with the problem of underexposed items. Using a system of rotating item pools, on the other hand, is a method that potentially solves both problems. In this method, a master pool is divided into (possibly overlapping) smaller item pools, which are required to have similar distributions of content and statistical attributes. These pools are rotated among the testing sites to realize desirable exposure rates for the items. A test assembly model, motivated by Gulliksen{\textquoteright}s matched random subtests method, was explored to help solve the problem of dividing a master pool into a set of smaller pools. Different methods to solve the model are proposed. An item pool from the Law School Admission Test was used to evaluate the performances of computerized adaptive tests from systems of rotating item pools constructed using these methods. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computerized adaptive tests, constrained adaptive testing, item exposure, rotating item pools}, isbn = {0022-0655 (Print)}, author = {Ariel, A. and Veldkamp, B. P. and van der Linden, W. J.} } @article {254, title = {The development and evaluation of a software prototype for computer-adaptive testing}, journal = {Computers and Education}, volume = {43}, number = {1-2}, year = {2004}, pages = {109-123}, keywords = {computerized adaptive testing}, author = {Lilley, M and Barker, T and Britton, C} } @article {69, title = {Effects of practical constraints on item selection rules at the early stages of computerized adaptive testing}, journal = {Journal of Educational Measurement}, volume = {41}, number = {2}, year = {2004}, pages = {149-174}, publisher = {Blackwell Publishing: United Kingdom}, abstract = {The purpose of this study was to compare the effects of four item selection rules--(1) Fisher information (F), (2) Fisher information with a posterior distribution (FP), (3) Kullback-Leibler information with a posterior distribution (KP), and (4) completely randomized item selection (RN)--with respect to the precision of trait estimation and the extent of item usage at the early stages of computerized adaptive testing. The comparison of the four item selection rules was carried out under three conditions: (1) using only the item information function as the item selection criterion; (2) using both the item information function and content balancing; and (3) using the item information function, content balancing, and item exposure control. 
When test length was less than 10 items, FP and KP tended to outperform F at extreme trait levels in Condition 1. However, in more realistic settings, it could not be concluded that FP and KP outperformed F, especially when item exposure control was imposed. When test length was greater than 10 items, the three nonrandom item selection procedures performed similarly no matter what the condition was, while F had slightly higher item usage. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computerized adaptive testing, item selection rules, practical constraints}, isbn = {0022-0655 (Print)}, author = {Chen, S-Y. and Ankenmann, R. D.} } @article {245, title = {{\'E}valuation et multim{\'e}dia dans l{\textquoteright}apprentissage d{\textquoteright}une L2 [Assessment and multimedia in learning an L2]}, journal = {ReCALL}, volume = {16}, number = {2}, year = {2004}, pages = {475-487}, abstract = {In the first part of this paper different areas where technology may be used for second language assessment are described. First, item banking operations, which are generally based on item Response Theory but not necessarily restricted to dichotomously scored items, facilitate assessment task organization and require technological support. Second, technology may help to design more authentic assessment tasks or may be needed in some direct testing situations. Third, the assessment environment may be more adapted and more stimulating when technology is used to give the student more control. The second part of the paper presents different functions of assessment. The monitoring function (often called formative assessment) aims at adapting the classroom activities to students and to provide continuous feedback. Technology may be used to train the teachers in monitoring techniques, to organize data or to produce diagnostic information; electronic portfolios or quizzes that are built in some educational software may also be used for monitoring. The placement function is probably the one in which the application of computer adaptive testing procedures (e.g. French CAPT) is the most appropriate. Automatic scoring devices may also be used for placement purposes. Finally the certification function requires more valid and more reliable tools. Technology may be used to enhance the testing situation (to make it more authentic) or to facilitate data processing during the construction of a test. Almond et al. (2002) propose a four component model (Selection, Presentation, Scoring and Response) for designing assessment systems. Each component must be planned taking into account the assessment function. }, keywords = {Adaptive Testing, Computer Assisted Instruction, Educational, Foreign Language Learning, Program Evaluation, Technology computerized adaptive testing}, author = {Laurier, M.} } @article {291, title = {Evaluation of the CATSIB DIF procedure in a pretest setting}, journal = {Journal of Educational and Behavioral Statistics}, volume = {29}, number = {2}, year = {2004}, pages = {177-199}, publisher = {American Educational Research Assn: US}, abstract = {A new procedure, CATSIB, for assessing differential item functioning (DIF) on computerized adaptive tests (CATs) is proposed. CATSIB, a modified SIBTEST procedure, matches test takers on estimated ability and controls for impact-induced Type I error inflation by employing a CAT version of the SIBTEST "regression correction." The performance of CATSIB in terms of detection of DIF in pretest items was evaluated in a simulation study. 
Simulated test takers were adaptively administered 25 operational items from a pool of 1,000 and were linearly administered 16 pretest items that were evaluated for DIF. Sample size varied from 250 to 500 in each group. Simulated impact levels ranged from a 0- to 1-standard-deviation difference in mean ability levels. The results showed that CATSIB with the regression correction displayed good control over Type I error, whereas CATSIB without the regression correction displayed impact-induced Type I error inflation. With 500 test takers in each group, power rates were exceptionally high (84\% to 99\%) for values of DIF at the boundary between moderate and large DIF. For smaller samples of 250 test takers in each group, the corresponding power rates ranged from 47\% to 95\%. In addition, in all cases, CATSIB was very accurate in estimating the true values of DIF, displaying at most only minor estimation bias. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computerized adaptive tests, differential item functioning}, isbn = {1076-9986 (Print)}, author = {Nandakumar, R. and Roussos, L. A.} } @proceedings {214, title = {An investigation of two combination procedures of SPRT for three-category classification decisions in computerized classification test}, journal = {annual meeting of the American Educational Research Association}, year = {2004}, note = {annual meeting of the American Educational Research Association, San Antonio}, month = {04/2004}, address = {San Antonio, Texas}, keywords = {computerized adaptive testing, Computerized classification testing, sequential probability ratio testing}, author = {Jiao, H. and Wang, S. and Lau, C. A.} } @article {139, title = {Kann die Konfundierung von Konzentrationsleistung und Aktivierung durch adaptives Testen mit dem FAKT vermieden werden? [Avoiding the confounding of concentration performance and activation by adaptive testing with the FACT]}, journal = {Zeitschrift f{\"u}r Differentielle und Diagnostische Psychologie}, volume = {25}, number = {1}, year = {2004}, pages = {1-17}, abstract = {The study investigates the effect of computerized adaptive testing strategies on the confounding of concentration performance with activation. A sample of 54 participants was administered 1 out of 3 versions (2 adaptive, 1 non-adaptive) of the computerized Frankfurt Adaptive Concentration Test FACT (Moosbrugger \& Heyden, 1997) at three subsequent points in time. During the test administration changes in activation (electrodermal activity) were recorded. The results pinpoint a confounding of concentration performance with activation for the non-adaptive test version, but not for the adaptive test versions (p = .01). Thus, adaptive FACT testing strategies can remove the confounding of concentration performance with activation, thereby increasing the discriminant validity. In conclusion, an attention-focusing hypothesis is formulated to explain the observed effect. (PsycINFO Database Record (c) 2005 APA ) (journal abstract)}, keywords = {Adaptive Testing, Computer Assisted Testing, Concentration, Performance, Testing computerized adaptive testing}, author = {Frey, A.
and Moosbrugger, H.} } @article {381, title = {Pre-equating: a simulation study based on a large scale assessment model}, journal = {Journal of Applied Measurement}, volume = {5}, number = {3}, year = {2004}, note = {1529-7713Journal Article}, pages = {301-18}, abstract = {Although post-equating (PE) has proven to be an acceptable method in the scaling and equating of items and forms, there are times when the turn-around period for equating and converting raw scores to scale scores is so small that PE cannot be undertaken within the prescribed time frame. In such cases, pre-equating (PrE) could be considered as an acceptable alternative. Assessing the feasibility of using item calibrations from the item bank (as in PrE) is conditioned on the equivalency of the calibrations and the errors associated with it vis a vis the results obtained via PE. This paper creates item banks over three periods of item introduction into the banks and uses the Rasch model in examining data with respect to the recovery of item parameters, the measurement error, and the effect cut-points have on examinee placement in both the PrE and PE situations. Results indicate that PrE is a viable solution to PE provided the stability of the item calibrations are enhanced by using large sample sizes (perhaps as large as full-population) in populating the item bank.}, keywords = {*Databases, *Models, Theoretical, Calibration, Human, Psychometrics, Reference Values, Reproducibility of Results}, author = {Taherbhai, H. M. and Young, M. J.} } @article {82, title = {Siette: a web-based tool for adaptive testing}, journal = {International Journal of Artificial Intelligence in Education}, volume = {14}, number = {1}, year = {2004}, pages = {29-61}, keywords = {computerized adaptive testing}, author = {Conejo, R and Guzm{\'a}n, E and Mill{\'a}n, E and Trella, M and P{\'e}rez-De-La-Cruz, JL and R{\'\i}os, A} } @article {93, title = {Strategies for controlling item exposure in computerized adaptive testing with the generalized partial credit model}, journal = {Applied Psychological Measurement}, volume = {28}, number = {3}, year = {2004}, pages = {165-185}, publisher = {Sage Publications: US}, abstract = {Choosing a strategy for controlling item exposure has become an integral part of test development for computerized adaptive testing (CAT). This study investigated the performance of six procedures for controlling item exposure in a series of simulated CATs under the generalized partial credit model. In addition to a no-exposure control baseline condition, the randomesque, modified-within-.10-logits, Sympson-Hetter, conditional Sympson-Hetter, a-stratified with multiple-stratification, and enhanced a-stratified with multiple-stratification procedures were implemented to control exposure rates. Two variations of the randomesque and modified-within-.10-logits procedures were examined, which varied the size of the item group from which the next item to be administered was randomly selected. The results indicate that although the conditional Sympson-Hetter provides somewhat lower maximum exposure rates, the randomesque and modified-within-.10-logits procedures with the six-item group variation have great utility for controlling overlap rates and increasing pool utilization and should be given further consideration. (PsycINFO Database Record (c) 2007 APA, all rights reserved)}, keywords = {computerized adaptive testing, generalized partial credit model, item exposure}, isbn = {0146-6216 (Print)}, author = {Davis, L. 
L.} } @article {278, title = {Using patterns of summed scores in paper-and-pencil tests and computer-adaptive tests to detect misfitting item score patterns}, journal = {Journal of Educational Measurement}, volume = {41}, number = {2}, year = {2004}, pages = {119-136}, abstract = {Two new methods have been proposed to determine unexpected sum scores on subtests (testlets) both for paper-and-pencil tests and computer adaptive tests. A method based on a conservative bound using the hypergeometric distribution, denoted ρ, was compared with a method where the probability for each score combination was calculated using a highest density region (HDR). Furthermore, these methods were compared with the standardized log-likelihood statistic with and without a correction for the estimated latent trait value (denoted as $l^{*}_{z}$ and $l_{z}$, respectively). Data were simulated on the basis of the one-parameter logistic model, and both parametric and nonparametric logistic regression was used to obtain estimates of the latent trait. Results showed that it is important to take the trait level into account when comparing subtest scores. In a nonparametric item response theory (IRT) context, an adapted version of the HDR method was a powerful alternative to ρ. In a parametric IRT context, results showed that $l^{*}_{z}$ had the highest power when the data were simulated conditionally on the estimated latent trait level. (PsycINFO Database Record (c) 2005 APA ) (journal abstract)}, keywords = {Computer Assisted Testing, Item Response Theory, person Fit, Test Scores}, author = {Meijer, R. R.} } @inbook {42, title = {Assessing question banks}, booktitle = {Reusing online resources: A sustainable approach to e-learning}, number = {1}, year = {2003}, pages = {171-230}, publisher = {Kogan Page Ltd.}, organization = {Kogan Page Ltd.}, address = {London, UK}, abstract = {In Chapter 14, Joanna Bull and James Dalziel provide a comprehensive treatment of the issues surrounding the use of Question Banks and Computer Assisted Assessment, and provide a number of excellent examples of implementations. In their review of the technologies employed in Computer Assisted Assessment the authors include Computer Adaptive Testing and data generation. The authors reveal significant issues involving the impact of Intellectual Property rights and computer assisted assessment and make important suggestions for strategies to overcome these obstacles. (PsycINFO Database Record (c) 2005 APA ) http://www-jime.open.ac.uk/2003/1/ (journal abstract)}, keywords = {Computer Assisted Testing, Curriculum Based Assessment, Education, Technology computerized adaptive testing}, author = {Bull, J. and Dalziel, J. and Vreeland, T.} } @article {275, title = {A Bayesian method for the detection of item preknowledge in computerized adaptive testing}, journal = {Applied Psychological Measurement}, volume = {27}, number = {2}, year = {2003}, pages = {121-137}, abstract = {With the increased use of continuous testing in computerized adaptive testing, new concerns about test security have evolved, such as how to ensure that items in an item pool are safeguarded from theft. In this article, procedures to detect test takers using item preknowledge are explored. When test takers use item preknowledge, their item responses deviate from the underlying item response theory (IRT) model, and estimated abilities may be inflated. This deviation may be detected through the use of person-fit indices.
A Bayesian posterior log odds ratio index is proposed for detecting the use of item preknowledge. In this approach to person fit, the estimated probability that each test taker has preknowledge of items is updated after each item response. These probabilities are based on the IRT parameters, a model specifying the probability that each item has been memorized, and the test taker{\textquoteright}s item responses. Simulations based on an operational computerized adaptive test (CAT) pool are used to demonstrate the use of the odds ratio index. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Adaptive Testing, Cheating, Computer Assisted Testing, Individual Differences computerized adaptive testing, Item, Item Analysis (Statistical), Mathematical Modeling, Response Theory}, author = {McLeod, L. and Lewis, C. and Thissen, D.} } @article {30, title = {Calibration of an item pool for assessing the burden of headaches: an application of item response theory to the Headache Impact Test (HIT)}, journal = {Quality of Life Research}, volume = {12}, number = {8}, year = {2003}, note = {0962-9343Journal Article}, pages = {913-933}, abstract = {BACKGROUND: Measurement of headache impact is important in clinical trials, case detection, and the clinical monitoring of patients. Computerized adaptive testing (CAT) of headache impact has potential advantages over traditional fixed-length tests in terms of precision, relevance, real-time quality control and flexibility. OBJECTIVE: To develop an item pool that can be used for a computerized adaptive test of headache impact. METHODS: We analyzed responses to four well-known tests of headache impact from a population-based sample of recent headache sufferers (n = 1016). We used confirmatory factor analysis for categorical data and analyses based on item response theory (IRT). RESULTS: In factor analyses, we found very high correlations between the factors hypothesized by the original test constructers, both within and between the original questionnaires. These results suggest that a single score of headache impact is sufficient. We established a pool of 47 items which fitted the generalized partial credit IRT model. By simulating a computerized adaptive health test we showed that an adaptive test of only five items had a very high concordance with the score based on all items and that different worst-case item selection scenarios did not lead to bias. CONCLUSION: We have established a headache impact item pool that can be used in CAT of headache impact.}, keywords = {*Cost of Illness, *Decision Support Techniques, *Sickness Impact Profile, Adolescent, Adult, Aged, Comparative Study, Disability Evaluation, Factor Analysis, Statistical, Headache/*psychology, Health Surveys, Human, Longitudinal Studies, Middle Aged, Migraine/psychology, Models, Psychological, Psychometrics/*methods, Quality of Life/*psychology, Software, Support, Non-U.S. Gov{\textquoteright}t}, author = {Bjorner, J. B. and Kosinski, M. and Ware, J. E., Jr.} } @article {63, title = {A comparative study of item exposure control methods in computerized adaptive testing}, journal = {Journal of Educational Measurement}, volume = {40}, number = {1}, year = {2003}, pages = {71-103}, abstract = {This study compared the properties of five methods of item exposure control within the purview of estimating examinees{\textquoteright} abilities in a computerized adaptive testing (CAT) context. 
Each exposure control algorithm was incorporated into the item selection procedure and the adaptive testing progressed based on the CAT design established for this study. The merits and shortcomings of these strategies were considered under different item pool sizes and different desired maximum exposure rates and were evaluated in light of the observed maximum exposure rates, the test overlap rates, and the conditional standard errors of measurement. Each method had its advantages and disadvantages, but no one possessed all of the desired characteristics. There was a clear and logical trade-off between item exposure control and measurement precision. The M. L. Stocking and C. Lewis conditional multinomial procedure and, to a slightly lesser extent, the T. Davey and C. G. Parshall method seemed to be the most promising considering all of the factors that this study addressed. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Adaptive Testing, Computer Assisted Testing, Educational, Item Analysis (Statistical), Measurement, Strategies computerized adaptive testing}, author = {Chang, S-W. and Ansley, T. N.} } @article {349, title = {Computerized adaptive rating scales for measuring managerial performance}, journal = {International Journal of Selection and Assessment}, volume = {11}, number = {2-3}, year = {2003}, pages = {237-246}, abstract = {Computerized adaptive rating scales (CARS) had been developed to measure contextual or citizenship performance. This rating format used a paired-comparison protocol, presenting pairs of behavioral statements scaled according to effectiveness levels, and an iterative item response theory algorithm to obtain estimates of ratees{\textquoteright} citizenship performance (W. C. Borman et al, 2001). In the present research, we developed CARS to measure the entire managerial performance domain, including task and citizenship performance, thus addressing a major limitation of the earlier CARS. The paper describes this development effort, including an adjustment to the algorithm that reduces substantially the number of item pairs required to obtain almost as much precision in the performance estimates. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Adaptive Testing, Algorithms, Associations, Citizenship, Computer Assisted Testing, Construction, Contextual, Item Response Theory, Job Performance, Management, Management Personnel, Rating Scales, Test}, author = {Schneider, R. J. and Goff, M. and Anderson, S. and Borman, W. C.} } @article {75, title = {Computerized adaptive testing using the nearest-neighbors criterion}, journal = {Applied Psychological Measurement}, volume = {27}, number = {3}, year = {2003}, pages = {204-216}, abstract = {Item selection procedures designed for computerized adaptive testing need to accurately estimate every taker{\textquoteright}s trait level (θ) and, at the same time, effectively use all items in a bank. Empirical studies showed that classical item selection procedures based on maximizing Fisher or other related information yielded highly varied item exposure rates; with these procedures, some items were frequently used whereas others were rarely selected. In the literature, methods have been proposed for controlling exposure rates; they tend to affect the accuracy in θ estimates, however. A modified version of the maximum Fisher information (MFI) criterion, coined the nearest neighbors (NN) criterion, is proposed in this study. 
The NN procedure improves to a moderate extent the undesirable item exposure rates associated with the MFI criterion and keeps sufficient precision in estimates. The NN criterion will be compared with a few other existing methods in an empirical study using the mean squared errors in θ estimates and plots of item exposure rates associated with different distributions. (PsycINFO Database Record (c) 2005 APA ) (journal abstract)}, keywords = {(Statistical), Adaptive Testing, Computer Assisted Testing, Item Analysis, Item Response Theory, Statistical Analysis, Statistical Estimation computerized adaptive testing, Statistical Tests}, author = {Cheng, P. E. and Liou, M.} } @article {156, title = {Computerized adaptive testing with item cloning}, journal = {Applied Psychological Measurement}, volume = {27}, number = {4}, year = {2003}, note = {References .Sage Publications, US}, pages = {247-261}, abstract = {(from the journal abstract) To increase the number of items available for adaptive testing and reduce the cost of item writing, the use of techniques of item cloning has been proposed. An important consequence of item cloning is possible variability between the item parameters. To deal with this variability, a multilevel item response (IRT) model is presented which allows for differences between the distributions of item parameters of families of item clones. A marginal maximum likelihood and a Bayesian procedure for estimating the hyperparameters are presented. In addition, an item-selection procedure for computerized adaptive testing with item cloning is presented which has the following two stages: First, a family of item clones is selected to be optimal at the estimate of the person parameter. Second, an item is randomly selected from the family for administration. Results from simulation studies based on an item pool from the Law School Admission Test (LSAT) illustrate the accuracy of these item pool calibration and adaptive testing procedures. (PsycINFO Database Record (c) 2003 APA, all rights reserved).}, keywords = {computerized adaptive testing}, author = {Glas, C. A. W. and van der Linden, W. J.} } @article {31, title = {The feasibility of applying item response theory to measures of migraine impact: a re-analysis of three clinical studies}, journal = {Quality of Life Research}, volume = {12}, number = {8}, year = {2003}, note = {0962-9343Journal Article}, pages = {887-902}, abstract = {BACKGROUND: Item response theory (IRT) is a powerful framework for analyzing multiitem scales and is central to the implementation of computerized adaptive testing. OBJECTIVES: To explain the use of IRT to examine measurement properties and to apply IRT to a questionnaire for measuring migraine impact--the Migraine Specific Questionnaire (MSQ). METHODS: Data from three clinical studies that employed the MSQ-version 1 were analyzed by confirmatory factor analysis for categorical data and by IRT modeling. RESULTS: Confirmatory factor analyses showed very high correlations between the factors hypothesized by the original test constructions. Further, high item loadings on one common factor suggest that migraine impact may be adequately assessed by only one score. IRT analyses of the MSQ were feasible and provided several suggestions as to how to improve the items and in particular the response choices. Out of 15 items, 13 showed adequate fit to the IRT model. In general, IRT scores were strongly associated with the scores proposed by the original test developers and with the total item sum score. 
Analysis of response consistency showed that more than 90\% of the patients answered consistently according to a unidimensional IRT model. For the remaining patients, scores on the dimension of emotional function were less strongly related to the overall IRT scores that mainly reflected role limitations. Such response patterns can be detected easily using response consistency indices. Analysis of test precision across score levels revealed that the MSQ was most precise at one standard deviation worse than the mean impact level for migraine patients that are not in treatment. Thus, gains in test precision can be achieved by developing items aimed at less severe levels of migraine impact. CONCLUSIONS: IRT proved useful for analyzing the MSQ. The approach warrants further testing in a more comprehensive item pool for headache impact that would enable computerized adaptive testing.}, keywords = {*Sickness Impact Profile, Adolescent, Adult, Aged, Comparative Study, Cost of Illness, Factor Analysis, Statistical, Feasibility Studies, Female, Human, Male, Middle Aged, Migraine/*psychology, Models, Psychological, Psychometrics/instrumentation/*methods, Quality of Life/*psychology, Questionnaires, Support, Non-U.S. Gov{\textquoteright}t}, author = {Bjorner, J. B. and Kosinski, M. and Ware, J. E., Jr.} } @article {250, title = {Incorporation of Content Balancing Requirements in Stratification Designs for Computerized Adaptive Testing}, journal = {Educational and Psychological Measurement}, volume = {63}, number = {2}, year = {2003}, pages = {257-70}, abstract = {Studied three stratification designs for computerized adaptive testing in conjunction with three well-developed content balancing methods. Simulation study results show substantial differences in item overlap rate and pool utilization among different methods. Recommends an optimal combination of stratification design and content balancing method. (SLD)}, keywords = {computerized adaptive testing}, author = {Leung, C-K.. and Chang, Hua-Hua and Hau, K-T.} } @article {94, title = {Item exposure constraints for testlets in the verbal reasoning section of the MCAT}, journal = {Applied Psychological Measurement}, volume = {27}, number = {5}, year = {2003}, pages = {335-356}, abstract = {The current study examined item exposure control procedures for testlet scored reading passages in the Verbal Reasoning section of the Medical College Admission Test with four computerized adaptive testing (CAT) systems using the partial credit model. The first system used a traditional CAT using maximum information item selection. The second used random item selection to provide a baseline for optimal exposure rates. The third used a variation of Lunz and Stahl{\textquoteright}s randomization procedure. The fourth used Luecht and Nungester{\textquoteright}s computerized adaptive sequential testing (CAST) system. A series of simulated fixed-length CATs was run to determine the optimal item length selection procedure. Results indicated that both the randomization procedure and CAST performed well in terms of exposure control and measurement precision, with the CAST system providing the best overall solution when all variables were taken into consideration. (PsycINFO Database Record (c) 2005 APA ) (journal abstract)}, keywords = {Adaptive Testing, Computer Assisted Testing, Entrance Examinations, Item Response Theory, Random Sampling, Reasoning, Verbal Ability computerized adaptive testing}, author = {Davis, L. L. and Dodd, B. 
G.} } @inbook {414, title = {Item selection in polytomous CAT}, booktitle = {New developments in psychometrics}, year = {2003}, pages = {207{\textendash}214}, publisher = {Psychometric Society, Springer}, organization = {Psychometric Society, Springer}, address = {Tokyo, Japan}, keywords = {computerized adaptive testing}, author = {Veldkamp, B. P.}, editor = {A. Okada and K. Shigemasu and Y. Kano and J. Meulman} } @article {57, title = {Optimal stratification of item pools in α-stratified computerized adaptive testing}, journal = {Applied Psychological Measurement}, volume = {27}, number = {4}, year = {2003}, pages = {262-274}, abstract = {A method based on 0-1 linear programming (LP) is presented to stratify an item pool optimally for use in α-stratified adaptive testing. Because the 0-1 LP model belongs to the subclass of models with a network flow structure, efficient solutions are possible. The method is applied to a previous item pool from the computerized adaptive testing (CAT) version of the Graduate Record Exams (GRE) Quantitative Test. The results indicate that the new method performs well in practical situations. It improves item exposure control, reduces the mean squared error in the θ estimates, and increases test reliability. (PsycINFO Database Record (c) 2005 APA ) (journal abstract)}, keywords = {Adaptive Testing, Computer Assisted Testing, Item Content (Test), Item Response Theory, Mathematical Modeling, Test Construction computerized adaptive testing}, author = {Chang, Hua-Hua and van der Linden, W. J.} } @article {68, title = {The relationship between item exposure and test overlap in computerized adaptive testing}, journal = {Journal of Educational Measurement}, volume = {40}, number = {2}, year = {2003}, pages = {129-145}, abstract = {The purpose of this article is to present an analytical derivation for the mathematical form of an average between-test overlap index as a function of the item exposure index, for fixed-length computerized adaptive tests (CATs). This algebraic relationship is used to investigate the simultaneous control of item exposure at both the item and test levels. The results indicate that, in fixed-length CATs, control of the average between-test overlap is achieved via the mean and variance of the item exposure rates of the items that constitute the CAT item pool. The mean of the item exposure rates is easily manipulated. Control over the variance of the item exposure rates can be achieved via the maximum item exposure rate ($r_{max}$). Therefore, item exposure control methods which implement a specification of $r_{max}$ (e.g., J. B. Sympson and R. D. Hetter, 1985) provide the most direct control at both the item and test levels. (PsycINFO Database Record (c) 2005 APA )}, keywords = {(Statistical), Adaptive Testing, Computer Assisted Testing, Human Computer, Interaction computerized adaptive testing, Item Analysis, Item Analysis (Test), Test Items}, author = {Chen, S-Y. and Ankenmann, R. D. and Spray, J. A.} } @article {397, title = {Some alternatives to Sympson-Hetter item-exposure control in computerized adaptive testing}, journal = {Journal of Educational and Behavioral Statistics}, volume = {28}, number = {3}, year = {2003}, pages = {249-265}, abstract = {The Hetter and Sympson (1997; 1985) method is a probabilistic method of item-exposure control in computerized adaptive testing.
Setting its control parameters to admissible values requires an iterative process of computer simulations that has been found to be time consuming, particularly if the parameters have to be set conditional on a realistic set of values for the examinees{\textquoteright} ability parameter. Formal properties of the method are identified that help us explain why this iterative process can be slow and does not guarantee admissibility. In addition, some alternatives to the SH method are introduced. The behavior of these alternatives was estimated for an adaptive test from an item pool from the Law School Admission Test (LSAT). Two of the alternatives showed attractive behavior and converged smoothly to admissibility for all items in a relatively small number of iteration steps. }, keywords = {Adaptive Testing, Computer Assisted Testing, Test Items computerized adaptive testing}, author = {van der Linden, W. J.} } @article {321, title = {Timing behavior in computerized adaptive testing: Response times for correct and incorrect answers are not related to general fluid intelligence/Zum Zeitverhalten beim computergest{\"u}tzten adaptiveb Testen: Antwortlatenzen bei richtigen und falschen L{\"o}sun}, journal = {Zeitschrift f{\"u}r Differentielle und Diagnostische Psychologie}, volume = {24}, number = {1}, year = {2003}, pages = {57-63}, abstract = {Examined the effects of general fluid intelligence on item response times for correct and false responses in computerized adaptive testing. After performing the CFT3 intelligence test, 80 individuals (aged 17-44 yrs) completed perceptual and cognitive discrimination tasks. Results show that response times were related neither to the proficiency dimension reflected by the task nor to the individual level of fluid intelligence. Furthermore, the false > correct-phenomenon as well as substantial positive correlations between item response times for false and correct responses were shown to be independent of intelligence levels. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Adaptive Testing, Cognitive Ability, Intelligence, Perception, Reaction Time computerized adaptive testing}, author = {Rammsayer, Thomas and Brandler, Susanne} } @article {407, title = {Using response times to detect aberrant responses in computerized adaptive testing}, journal = {Psychometrika}, volume = {68}, number = {2}, year = {2003}, pages = {251-265}, abstract = {A lognormal model for response times is used to check response times for aberrances in examinee behavior on computerized adaptive tests. Both classical procedures and Bayesian posterior predictive checks are presented. For a fixed examinee, responses and response times are independent; checks based on response times offer thus information independent of the results of checks on response patterns. Empirical examples of the use of classical and Bayesian checks for detecting two different types of aberrances in response times are presented. The detection rates for the Bayesian checks outperformed those for the classical checks, but at the cost of higher false-alarm rates. A guideline for the choice between the two types of checks is offered.}, keywords = {Adaptive Testing, Behavior, Computer Assisted Testing, computerized adaptive testing, Models, person Fit, Prediction, Reaction Time}, author = {van der Linden, W. J. and van Krimpen-Stoop, E. M. L. 
A.} } @article {48, title = {Advances in quality of life measurements in oncology patients}, journal = {Seminars in Oncology}, volume = {29}, number = {3 Suppl 8}, year = {2002}, note = {0093-7754 (Print)Journal ArticleReview}, month = {Jun}, pages = {60-8}, abstract = {Accurate assessment of the quality of life (QOL) of patients can provide important clinical information to physicians, especially in the area of oncology. Changes in QOL are important indicators of the impact of a new cytotoxic therapy, can affect a patient{\textquoteright}s willingness to continue treatment, and may aid in defining response in the absence of quantifiable endpoints such as tumor regression. Because QOL is becoming an increasingly important aspect in the management of patients with malignant disease, it is vital that the instruments used to measure QOL are reliable and accurate. Assessment of QOL involves a multidimensional approach that includes physical, functional, social, and emotional well-being, and the most comprehensive instruments measure at least three of these domains. Instruments to measure QOL can be generic (eg, the Nottingham Health Profile), targeted toward specific illnesses (eg, Functional Assessment of Cancer Therapy - Lung), or be a combination of generic and targeted. Two of the most widely used examples of the combination, or hybrid, instruments are the European Organization for Research and Treatment of Cancer Quality of Life Questionnaire Core 30 Items and the Functional Assessment of Chronic Illness Therapy. A consequence of the increasing international collaboration in clinical trials has been the growing necessity for instruments that are valid across languages and cultures. To assure the continuing reliability and validity of QOL instruments in this regard, item response theory can be applied. Techniques such as item response theory may be used in the future to construct QOL item banks containing large sets of validated questions that represent various levels of QOL domains. As QOL becomes increasingly important in understanding and approaching the overall management of cancer patients, the tools available to clinicians and researchers to assess QOL will continue to evolve. While the instruments currently available provide reliable and valid measurement, further improvements in precision and application are anticipated.}, keywords = {*Quality of Life, *Sickness Impact Profile, Cross-Cultural Comparison, Culture, Humans, Language, Neoplasms/*physiopathology, Questionnaires}, author = {Cella, D. and Chang, C-H. and Lai, J. S. and Webster, K.} } @article {305, title = {Assessing tobacco beliefs among youth using item response theory models}, journal = {Drug and Alcohol Dependence}, volume = {68}, number = {Suppl 1}, year = {2002}, note = {0376-8716Journal Article}, month = {Nov}, pages = {S21-S39}, abstract = {Successful intervention research programs to prevent adolescent smoking require well-chosen, psychometrically sound instruments for assessing smoking prevalence and attitudes. Twelve thousand eight hundred and ten adolescents were surveyed about their smoking beliefs as part of the Teenage Attitudes and Practices Survey project, a prospective cohort study of predictors of smoking initiation among US adolescents. Item response theory (IRT) methods are used to frame a discussion of questions that a researcher might ask when selecting an optimal item set. 
IRT methods are especially useful for choosing items during instrument development, trait scoring, evaluating item functioning across groups, and creating optimal item subsets for use in specialized applications such as computerized adaptive testing. Data analytic steps for IRT modeling are reviewed for evaluating item quality and differential item functioning across subgroups of gender, age, and smoking status. Implications and challenges in the use of these methods for tobacco onset research and for assessing the developmental trajectories of smoking among youth are discussed.}, keywords = {*Attitude to Health, *Culture, *Health Behavior, *Questionnaires, Adolescent, Adult, Child, Female, Humans, Male, Models, Statistical, Smoking/*epidemiology}, author = {Panter, A. T. and Reeve, B. B.} } @article {308, title = {A comparison of item selection techniques and exposure control mechanisms in CATs using the generalized partial credit model}, journal = {Applied Psychological Measurement}, volume = {26}, number = {2}, year = {2002}, pages = {147-163}, abstract = {The use of more performance items in large-scale testing has led to an increase in the research investigating the use of polytomously scored items in computer adaptive testing (CAT). Because this research has to be complemented with information pertaining to exposure control, the present research investigated the impact of using five different exposure control algorithms in two sized item pools calibrated using the generalized partial credit model. The results of the simulation study indicated that the a-stratified design, in comparison to a no-exposure control condition, could be used to reduce item exposure and overlap, increase pool utilization, and only minorly degrade measurement precision. Use of the more restrictive exposure control algorithms, such as the Sympson-Hetter and conditional Sympson-Hetter, controlled exposure to a greater extent but at the cost of measurement precision. Because convergence of the exposure control parameters was problematic for some of the more restrictive exposure control algorithms, use of the more simplistic exposure control mechanisms, particularly when the test length to item pool size ratio is large, is recommended. (PsycINFO Database Record (c) 2005 APA ) (journal abstract)}, keywords = {(Statistical), Adaptive Testing, Algorithms computerized adaptive testing, Computer Assisted Testing, Item Analysis, Item Response Theory, Mathematical Modeling}, author = {Pastor, D. A. and Dodd, B. G. and Chang, Hua-Hua} } @article {391, title = {Computer adaptive testing: The impact of test characteristics on perceived performance and test takers{\textquoteright} reactions}, journal = {Dissertation Abstracts International: Section B: the Sciences \& Engineering}, volume = {62}, number = {7-B}, year = {2002}, pages = {3410}, abstract = {This study examined the relationship between characteristics of adaptive testing and test takers{\textquoteright} subsequent reactions to the test. Participants took a computer adaptive test in which two features, the difficulty of the initial item and the difficulty of subsequent items, were manipulated. These two features of adaptive testing determined the number of items answered correctly by examinees and their subsequent reactions to the test. The data show that the relationship between test characteristics and reactions was fully mediated by perceived performance on the test. In addition, the impact of feedback on reactions to adaptive testing was also evaluated. 
In general, feedback that was consistent with perceptions of performance had a positive impact on reactions to the test. Implications for adaptive test design concerning maximizing test takers{\textquoteright} reactions are discussed. (PsycINFO Database Record (c) 2003 APA, all rights reserved).}, keywords = {computerized adaptive testing}, author = {Tonidandel, S.} } @article {242, title = {Computerised adaptive testing}, journal = {British Journal of Educational Technology}, volume = {33}, number = {5}, year = {2002}, pages = {619-22}, abstract = {Considers the potential of computer adaptive testing (CAT). Discusses the use of CAT instead of traditional paper and pencil tests, identifies decisions that impact the efficacy of CAT, and concludes that CAT is beneficial when used to its full potential on certain types of tests. (LRW)}, keywords = {computerized adaptive testing}, author = {Latu, E. and Chapman, E.} } @article {14, title = {Data sparseness and on-line pretest item calibration-scaling methods in CAT}, journal = {Journal of Educational Measurement}, volume = {39}, number = {3}, year = {2002}, pages = {207-218}, abstract = {Compared and evaluated 3 on-line pretest item calibration-scaling methods (the marginal maximum likelihood estimate with 1 expectation maximization [EM] cycle [OEM] method, the marginal maximum likelihood estimate with multiple EM cycles [MEM] method, and M. L. Stocking{\textquoteright}s Method B) in terms of item parameter recovery when the item responses to the pretest items in the pool are sparse. Simulations of computerized adaptive tests were used to evaluate the results yielded by the three methods. The MEM method produced the smallest average total error in parameter estimation, and the OEM method yielded the largest total error (PsycINFO Database Record (c) 2005 APA )}, keywords = {Computer Assisted Testing, Educational Measurement, Item Response Theory, Maximum Likelihood, Methodology, Scaling (Testing), Statistical Data}, author = {Ban, J-C. and Hanson, B. A. and Yi, Q. and Harris, D. J.} } @article {335, title = {The effect of test characteristics on aberrant response patterns in computer adaptive testing}, journal = {Dissertation Abstracts International Section A: Humanities \& Social Sciences}, volume = {62}, number = {10-A}, year = {2002}, pages = {3363}, abstract = {The advantages that computer adaptive testing offers over linear tests have been well documented. The Computer Adaptive Test (CAT) design is more efficient than the Linear test design as fewer items are needed to estimate an examinee{\textquoteright}s proficiency to a desired level of precision. In the ideal situation, a CAT will result in examinees answering different number of items according to the stopping rule employed. Unfortunately, the realities of testing conditions have necessitated the imposition of time and minimum test length limits on CATs. Such constraints might place a burden on the CAT test taker resulting in aberrant response behaviors by some examinees. Occurrence of such response patterns results in inaccurate estimation of examinee proficiency levels. This study examined the effects of test lengths, time limits and the interaction of these factors with the examinee proficiency levels on the occurrence of aberrant response patterns. The focus of the study was on the aberrant behaviors caused by rushed guessing due to restrictive time limits. 
Four different testing scenarios were examined: fixed-length performance tests with and without content constraints, fixed-length mastery tests, and variable-length mastery tests without content constraints. For each of these testing scenarios, the effects of two test lengths, five different timing conditions, and the interaction between these factors and three ability levels on ability estimation were examined. For fixed and variable length mastery tests, decision accuracy was also looked at in addition to the estimation accuracy. Several indices were used to evaluate the estimation and decision accuracy for different testing conditions. The results showed that changing time limits had a significant impact on the occurrence of aberrant response patterns conditional on ability. Increasing test length had a negligible, if not negative, effect on ability estimation when rushed guessing occurred. In the case of performance testing, high-ability examinees suffered the most, while in classification testing, middle-ability examinees did. The decision accuracy was considerably affected in the case of variable-length classification tests. (PsycINFO Database Record (c) 2003 APA, all rights reserved).}, keywords = {computerized adaptive testing}, author = {Rizavi, S. M.} } @article {370, title = {An EM approach to parameter estimation for the Zinnes and Griggs paired comparison IRT model}, journal = {Applied Psychological Measurement}, volume = {26}, number = {2}, year = {2002}, pages = {208-227}, abstract = {Borman et al. recently proposed a computer adaptive performance appraisal system called CARS II that utilizes paired comparison judgments of behavioral stimuli. To implement this approach, the paired comparison ideal point model developed by Zinnes and Griggs was selected. In this article, the authors describe item response and information functions for the Zinnes and Griggs model and present procedures for estimating stimulus and person parameters. Monte Carlo simulations were conducted to assess the accuracy of the parameter estimation procedures. The results indicated that at least 400 ratees (i.e., ratings) are required to obtain reasonably accurate estimates of the stimulus parameters and their standard errors. In addition, latent trait estimation improves as test length increases. The implications of these results for test construction are also discussed. }, keywords = {Adaptive Testing, Computer Assisted Testing, Item Response Theory, Maximum Likelihood, Personnel Evaluation, Statistical Correlation, Statistical Estimation}, author = {Stark, S. and F Drasgow} } @conference {223, title = {An empirical comparison of achievement level estimates from adaptive tests and paper-and-pencil tests}, booktitle = {annual meeting of the American Educational Research Association}, year = {2002}, address = {New Orleans, LA. USA}, keywords = {computerized adaptive testing}, author = {Kingsbury, G. G.} } @article {412, title = {Evaluation of selection procedures for computerized adaptive testing with polytomous items}, journal = {Applied Psychological Measurement}, volume = {26}, number = {4}, year = {2002}, note = {References .Sage Publications, US}, pages = {393-411}, abstract = {In the present study, a procedure that has been used to select dichotomous items in computerized adaptive testing was applied to polytomous items. This procedure was designed to select the item with maximum weighted information.
In a simulation study, the item information function was integrated over a fixed interval of ability values and the item with the maximum area was selected. This maximum interval information item selection procedure was compared to a maximum point information item selection procedure. Substantial differences between the two item selection procedures were not found when computerized adaptive tests were evaluated on bias and the root mean square of the ability estimate. }, keywords = {computerized adaptive testing}, author = {van Rijn, P. W. and Theo Eggen and Hemker, B. T. and Sanders, P. F.} } @inbook {119, title = {Generating abstract reasoning items with cognitive theory}, booktitle = {Item generation for test development}, year = {2002}, note = {Using Smart Source ParsingItem generation for test development. (pp. 219-250). Mahwah, NJ : Lawrence Erlbaum Associates, Publishers. xxxii, 412 pp}, pages = {219-250}, publisher = {Lawrence Erlbaum Associates, Inc.}, organization = {Lawrence Erlbaum Associates, Inc.}, address = {Mahwah, N.J. USA}, abstract = {(From the chapter) Developed and evaluated a generative system for abstract reasoning items based on cognitive theory. The cognitive design system approach was applied to generate matrix completion problems. Study 1 involved developing the cognitive theory with 191 college students who were administered Set I and Set II of the Advanced Progressive Matrices. Study 2 examined item generation by cognitive theory. Study 3 explored the psychometric properties and construct representation of abstract reasoning test items with 728 young adults. Five structurally equivalent forms of Abstract Reasoning Test (ART) items were prepared from the generated item bank and administered to the Ss. In Study 4, the nomothetic span of construct validity of the generated items was examined with 728 young adults who were administered ART items, and 217 young adults who were administered ART items and the Advanced Progressive Matrices. Results indicate the matrix completion items were effectively generated by the cognitive design system approach. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Cognitive Processes, Measurement, Reasoning, Test Construction, Test Items, Test Validity, Theories}, author = {Embretson, S. E.}, editor = {P. Kyllomen} } @article {60, title = {Hypergeometric family and item overlap rates in computerized adaptive testing}, journal = {Psychometrika}, volume = {67}, number = {3}, year = {2002}, pages = {387-398}, abstract = {A computerized adaptive test (CAT) is usually administered to small groups of examinees at frequent time intervals. It is often the case that examinees who take the test earlier share information with examinees who will take the test later, thus increasing the risk that many items may become known. Item overlap rate for a group of examinees refers to the number of overlapping items encountered by these examinees divided by the test length. For a specific item pool, different item selection algorithms may yield different item overlap rates. An important issue in designing a good CAT item selection algorithm is to keep item overlap rate below a preset level. In doing so, it is important to investigate what the lowest rate could be for all possible item selection algorithms. 
In this paper we rigorously prove that if every item had an equal probability of being selected from the pool in a fixed-length CAT, the number of overlapping items among any α randomly sampled examinees follows the hypergeometric distribution family for α >= 1. Thus, the expected values of the number of overlapping items among any α randomly sampled examinees can be calculated precisely. These values may serve as benchmarks in controlling item overlap rates for fixed-length adaptive tests. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Adaptive Testing, Algorithms, Computer Assisted Testing, Taking, Test, Time On Task computerized adaptive testing}, author = {Chang, Hua-Hua and Zhang, J.} } @article {161, title = {The implications of the use of non-optimal items in a Computer Adaptive Testing (CAT) environment}, journal = {Dissertation Abstracts International: Section B: the Sciences \& Engineering}, volume = {63}, number = {3-B}, year = {2002}, pages = {1606}, abstract = {This study describes the effects of manipulating item difficulty in a computer adaptive testing (CAT) environment. There are many potential benefits when using CATs as compared to traditional tests. These include increased security, shorter tests, and more precise measurement. According to IRT, the theory underlying CAT, as the computer continually recalculates ability, items that match the current estimate of ability are administered. Such items provide maximum information about examinees during the test. Herein, however, lies a potential problem. These optimal CAT items result in an examinee having only a 50\% chance of a correct response. Some examinees may consider such items unduly challenging. Further, when test anxiety is a factor, it is possible that test scores may be negatively affected. This research was undertaken to determine the effects of administering easier CAT items on ability estimation and test length using computer simulations. Also considered was the administration of different numbers of initial items prior to the start of the adaptive portion of the test, using three different levels of measurement precision. Results indicate that regardless of the number of initial items administered, the level of precision employed, or the modifications made to item difficulty, the approximation of estimated ability to true ability is good in all cases. Additionally, the standard deviations of the ability estimates closely approximate the theoretical levels of precision used as stopping rules for the simulated CATs. Since optimal CAT items are not used, each item administered provides less information about examinees than optimal CAT items. This results in longer tests. Fortunately, using easier items that provide up to a 66.4\% chance of a correct response results in tests that only modestly increase in length, across levels of precision. For larger standard errors, even easier items (up to a 73.5\% chance of a correct response) result in only negligible to modest increases in test length. Examinees who find optimal CAT items difficult or examinees with test anxiety may find that CATs that implement easier items enhance the already existing benefits of CAT. (PsycINFO Database Record (c) 2003 APA, all rights reserved).}, keywords = {computerized adaptive testing}, author = {Grodenchik, D.
J.} } @article {12, title = {Information technology and literacy assessment}, journal = {Reading and Writing Quarterly}, volume = {18}, number = {4}, year = {2002}, pages = {369-373}, abstract = {This column discusses information technology and literacy assessment in the past and present. The author also describes computer-based assessments today including the following topics: computer-scored testing, computer-administered formal assessment, Internet formal assessment, computerized adaptive tests, placement tests, informal assessment, electronic portfolios, information management, and Internet information dissemination. A model of the major present-day applications of information technologies in reading and literacy assessment is also included. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Computer Applications, Computer Assisted Testing, Information, Internet, Literacy, Models, Systems, Technology}, author = {Balajthy, E.} } @article {354, title = {An item response model for characterizing test compromise}, journal = {Journal of Educational and Behavioral Statistics}, volume = {27}, number = {2}, year = {2002}, note = {References .American Educational Research Assn, US}, pages = {163-179}, abstract = {This article presents an item response model for characterizing test-compromise that enables the estimation of item-preview and score-gain distributions observed in on-demand high-stakes testing programs. Model parameters and posterior distributions are estimated by Markov Chain Monte Carlo (MCMC) procedures. Results of a simulation study suggest that when at least some of the items taken by a small sample of test takers are known to be secure (uncompromised), the procedure can provide useful summaries of test-compromise and its impact on test scores. The article includes discussions of operational use of the proposed procedure, possible model violations and extensions, and application to computerized adaptive testing. }, keywords = {computerized adaptive testing}, author = {Segall, D. O.} } @article {418, title = {Mathematical-programming approaches to test item pool design}, number = {RR 02-09}, year = {2002}, note = {Using Smart Source ParsingAdvances in psychology research, Vol. ( Hauppauge, NY : Nova Science Publishers, Inc, [URL:http://www.Novapublishers.com]. vi, 228 pp}, pages = {93-108}, institution = {University of Twente, Faculty of Educational Science and Technology}, address = {Twente, The Netherlands}, abstract = {(From the chapter) This paper presents an approach to item pool design that has the potential to improve on the quality of current item pools in educational and psychological testing and hence to increase both measurement precision and validity. The approach consists of the application of mathematical programming techniques to calculate optimal blueprints for item pools. These blueprints can be used to guide the item-writing process. Three different types of design problems are discussed, namely for item pools for linear tests, item pools computerized adaptive testing (CAT), and systems of rotating item pools for CAT. The paper concludes with an empirical example of the problem of designing a system of rotating item pools for CAT.}, keywords = {Adaptive Testing, Computer Assisted, Computer Programming, Educational Measurement, Item Response Theory, Mathematics, Psychometrics, Statistical Rotation computerized adaptive testing, Test Items, Testing}, isbn = {02-09}, author = {Veldkamp, B. P. and van der Linden, W. J. 
and Ariel, A.} } @article {50, title = {Measuring quality of life in chronic illness: the functional assessment of chronic illness therapy measurement system}, journal = {Archives of Physical Medicine and Rehabilitation}, volume = {83}, number = {12 Suppl 2}, year = {2002}, note = {0003-9993Journal Article}, month = {Dec}, pages = {S10-7}, abstract = {We focus on quality of life (QOL) measurement as applied to chronic illness. There are 2 major types of health-related quality of life (HRQOL) instruments-generic health status and targeted. Generic instruments offer the opportunity to compare results across patient and population cohorts, and some can provide normative or benchmark data from which to interpret results. Targeted instruments ask questions that focus more on the specific condition or treatment under study and, as a result, tend to be more responsive to clinically important changes than generic instruments. Each type of instrument has a place in the assessment of HRQOL in chronic illness, and consideration of the relative advantages and disadvantages of the 2 options best drives choice of instrument. The Functional Assessment of Chronic Illness Therapy (FACIT) system of HRQOL measurement is a hybrid of the 2 approaches. The FACIT system combines a core general measure with supplemental measures targeted toward specific diseases, conditions, or treatments. Thus, it capitalizes on the strengths of each type of measure. Recently, FACIT questionnaires were administered to a representative sample of the general population with results used to derive FACIT norms. These normative data can be used for benchmarking and to better understand changes in HRQOL that are often seen in clinical trials. Future directions in HRQOL assessment include test equating, item banking, and computerized adaptive testing.}, keywords = {*Chronic Disease, *Quality of Life, *Rehabilitation, Adult, Comparative Study, Health Status Indicators, Humans, Psychometrics, Questionnaires, Research Support, U.S. Gov{\textquoteright}t, P.H.S., Sensitivity and Specificity}, author = {Cella, D. and Nowinski, C. J.} } @article {146, title = {Multidimensional adaptive testing for mental health problems in primary care}, journal = {Medical Care}, volume = {40}, number = {9}, year = {2002}, note = {Gardner, WilliamKelleher, Kelly JPajer, Kathleen AMCJ-177022/PHS HHS/MH30915/MH/NIMH NIH HHS/MH50629/MH/NIMH NIH HHS/Med Care. 2002 Sep;40(9):812-23.}, month = {Sep}, pages = {812-23}, edition = {2002/09/10}, abstract = {OBJECTIVES: Efficient and accurate instruments for assessing child psychopathology are increasingly important in clinical practice and research. For example, screening in primary care settings can identify children and adolescents with disorders that may otherwise go undetected. However, primary care offices are notorious for the brevity of visits and screening must not burden patients or staff with long questionnaires. One solution is to shorten assessment instruments, but dropping questions typically makes an instrument less accurate. An alternative is adaptive testing, in which a computer selects the items to be asked of a patient based on the patient{\textquoteright}s previous responses. This research used a simulation to test a child mental health screen based on this technology. RESEARCH DESIGN: Using half of a large sample of data, a computerized version was developed of the Pediatric Symptom Checklist (PSC), a parental-report psychosocial problem screen. 
With the unused data, a simulation was conducted to determine whether the Adaptive PSC can reproduce the results of the full PSC with greater efficiency. SUBJECTS: PSCs were completed by parents on 21,150 children seen in a national sample of primary care practices. RESULTS: Four latent psychosocial problem dimensions were identified through factor analysis: internalizing problems, externalizing problems, attention problems, and school problems. A simulated adaptive test measuring these traits asked an average of 11.6 questions per patient, and asked five or fewer questions for 49\% of the sample. There was high agreement between the adaptive test and the full (35-item) PSC: only 1.3\% of screening decisions were discordant (kappa = 0.93). This agreement was higher than that obtained using a comparable length (12-item) short-form PSC (3.2\% of decisions discordant; kappa = 0.84). CONCLUSIONS: Multidimensional adaptive testing may be an accurate and efficient technology for screening for mental health problems in primary care settings.}, keywords = {Adolescent, Child, Child Behavior Disorders/*diagnosis, Child Health Services/*organization \& administration, Factor Analysis, Statistical, Female, Humans, Linear Models, Male, Mass Screening/*methods, Parents, Primary Health Care/*organization \& administration}, isbn = {0025-7079 (Print)0025-7079 (Linking)}, author = {Gardner, W. and Kelleher, K. J. and Pajer, K. A.} } @article {277, title = {Outlier detection in high-stakes certification testing}, journal = {Journal of Educational Measurement}, volume = {39}, number = {3}, year = {2002}, pages = {219-233}, abstract = {Discusses recent developments in person-fit analysis in computerized adaptive testing (CAT). Methods from statistical process control are presented that have been proposed to classify an item score pattern as fitting or misfitting the underlying item response theory model in CAT. Most person-fit research in CAT is restricted to simulated data. In this study, empirical data from a certification test were used. Alternatives are discussed to generate norms so that bounds can be determined to classify an item score pattern as fitting or misfitting. Using bounds determined from a sample of a high-stakes certification test, the empirical analysis showed that different types of misfit can be distinguished. Further applications using statistical process control methods to detect misfitting item score patterns are discussed. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Adaptive Testing, computerized adaptive testing, Educational Measurement, Goodness of Fit, Item Analysis (Statistical), Item Response Theory, person Fit, Statistical Estimation, Statistical Power, Test Scores}, author = {Meijer, R. R.} } @article {346, title = {A structure-based approach to psychological measurement: Matching measurement models to latent structure}, journal = {Assessment}, volume = {9}, number = {1}, year = {2002}, pages = {4-16}, abstract = {The present article sets forth the argument that psychological assessment should be based on a construct{\textquoteright}s latent structure. The authors differentiate dimensional (continuous) and taxonic (categorical) structures at the latent and manifest levels and describe the advantages of matching the assessment approach to the latent structure of a construct. A proper match will decrease measurement error, increase statistical power, clarify statistical relationships, and facilitate the location of an efficient cutting score when applicable.
Thus, individuals will be placed along a continuum or assigned to classes more accurately. The authors briefly review the methods by which latent structure can be determined and outline a structure-based approach to assessment that builds on dimensional scaling models, such as item response theory, while incorporating classification methods as appropriate. Finally, the authors empirically demonstrate the utility of their approach and discuss its compatibility with traditional assessment methods and with computerized adaptive testing. (PsycINFO Database Record (c) 2005 APA ) (journal abstract)}, keywords = {Adaptive Testing, Assessment, Classification (Cognitive Process), Computer Assisted, Item Response Theory, Psychological, Scaling (Testing), Statistical Analysis computerized adaptive testing, Taxonomies, Testing}, author = {Ruscio, John and Ruscio, Ayelet Meron} } @inbook {108, title = {The work ahead: A psychometric infrastructure for computerized adaptive tests}, booktitle = {Computer-based tests: Building the foundation for future assessment}, year = {2002}, note = {Using Smart Source ParsingComputer-based testing: Building the foundation for future assessments. (pp. 1-35). Mahwah, NJ : Lawrence Erlbaum Associates, Publishers. xi, 326 pp}, publisher = {Lawrence Erlbaum Associates, Inc.}, organization = {Lawrence Erlbaum Associates, Inc.}, address = {Mahwah, N.J. USA}, abstract = {(From the chapter) Considers the past and future of computerized adaptive tests and computer-based tests and looks at issues and challenges confronting a testing program as it implements and operates a computer-based test. Recommendations for testing programs from the National Council on Measurement in Education Ad Hoc Committee on Computerized Adaptive Test Disclosure are appended. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Adaptive Testing, Computer Assisted Testing, Educational, Measurement, Psychometrics}, author = {F Drasgow}, editor = {M. P. Potenza and J. J. Freemer and W. C. Ward} } @article {89, title = {Assessment in the twenty-first century: A role of computerised adaptive testing in national curriculum subjects}, journal = {Teacher Development}, volume = {5}, number = {2}, year = {2001}, pages = {241-57}, abstract = {With the investment of large sums of money in new technologies for schools and education authorities and the subsequent training of teachers to integrate Information and Communications Technology (ICT) into their teaching strategies, it is remarkable that the old outdated models of assessment still remain. This article highlights the current problems associated with pen-and-paper testing and offers suggestions for an innovative and new approach to assessment for the twenty-first century. Based on the principle of the {\textquoteright}wise examiner{\textquoteright}, a computerised adaptive testing system which measures pupils{\textquoteright} ability against the levels of the United Kingdom National Curriculum has been developed for use in mathematics. Using constructed response items, pupils are administered a test tailored to their ability with a reliability index of 0.99. Since the software administers maximally informative questions matched to each pupil{\textquoteright}s current ability estimate, no two pupils will receive the same set of items in the same order, therefore removing opportunities for plagiarism and teaching to the test.
All marking is automated, and a journal recording the outcome of the test and highlighting the areas of difficulty for each pupil is available for printing by the teacher. The current prototype of the system can be used on a school{\textquoteright}s network; however, the authors envisage a day when Examination Boards or the Qualifications and Assessment Authority (QCA) will administer Government tests from a central server to all United Kingdom schools or testing centres. Results will be issued at the time of testing, and opportunities for resits will become more widespread.}, keywords = {computerized adaptive testing}, author = {Cowan, P. and Morrison, H.} } @article {55, title = {a-stratified multistage computerized adaptive testing with b blocking}, journal = {Applied Psychological Measurement}, volume = {25}, number = {4}, year = {2001}, pages = {333-41}, abstract = {Proposed a refinement, based on the stratification of items developed by D. Weiss (1973), of the computerized adaptive testing item selection procedure of H. Chang and Z. Ying (1999). Simulation studies using an item bank from the Graduate Record Examination show the benefits of the new procedure. (SLD)}, keywords = {computerized adaptive testing}, author = {Chang, Hua-Hua and Qian, J. and Yang, Z.} } @article {336, title = {Computerized adaptive testing with the generalized graded unfolding model}, journal = {Applied Psychological Measurement}, volume = {25}, number = {2}, year = {2001}, pages = {177-196}, abstract = {Examined the use of the generalized graded unfolding model (GGUM) in computerized adaptive testing. The objective was to minimize the number of items required to produce equiprecise estimates of person locations. Simulations based on real data about college student attitudes toward abortion and on data generated to fit the GGUM were used. It was found that as few as 7 or 8 items were needed to produce accurate and precise person estimates using an expected a posteriori procedure. The number of items in the item bank (20, 40, or 60 items) and their distribution on the continuum (uniform locations or item clusters in moderately extreme locations) had only small effects on the accuracy and precision of the estimates. These results suggest that adaptive testing with the GGUM is a good method for achieving estimates with an approximately uniform level of precision using a small number of items. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Attitude Measurement, College Students computerized adaptive testing, Computer Assisted Testing, Item Response, Models, Statistical Estimation, Theory}, author = {Roberts, J. S. and Lin, Y. and Laughlin, J. E.} } @article {358, title = {Developments in measurement of persons and items by means of item response models}, journal = {Behaviormetrika}, volume = {28}, number = {1}, year = {2001}, pages = {65-94}, abstract = {This paper starts with a general introduction into measurement of hypothetical constructs typical of the social and behavioral sciences. After the stages ranging from theory through operationalization and item domain to preliminary test or questionnaire have been treated, the general assumptions of item response theory are discussed. The family of parametric item response models for dichotomous items is introduced and it is explained how parameters for respondents and items are estimated from the scores collected from a sample of respondents who took the test or questionnaire.
Next, the family of nonparametric item response models is explained, followed by the 3 classes of item response models for polytomous item scores (e.g., rating scale scores). Then, to what degree the mean item score and the unweighted sum of item scores for persons are useful for measuring items and persons in the context of item response theory is discussed. Methods for fitting parametric and nonparametric models to data are briefly discussed. Finally, the main applications of item response models are discussed, which include equating and item banking, computerized and adaptive testing, research into differential item functioning, person fit research, and cognitive modeling. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Cognitive, Computer Assisted Testing, Item Response Theory, Models, Nonparametric Statistical Tests, Processes}, author = {Sijtsma, K.} } @article {315, title = {Differences between self-adapted and computerized adaptive tests: A meta-analysis}, journal = {Journal of Educational Measurement}, volume = {38}, number = {3}, year = {2001}, pages = {235-247}, abstract = {Self-adapted testing has been described as a variation of computerized adaptive testing that reduces test anxiety and thereby enhances test performance. The purpose of this study was to gain a better understanding of these proposed effects of self-adapted tests (SATs); meta-analysis procedures were used to estimate differences between SATs and computerized adaptive tests (CATs) in proficiency estimates and post-test anxiety levels across studies in which these two types of tests have been compared. After controlling for measurement error the results showed that SATs yielded proficiency estimates that were 0.12 standard deviation units higher and post-test anxiety levels that were 0.19 standard deviation units lower than those yielded by CATs. The authors speculate about possible reasons for these differences and discuss advantages and disadvantages of using SATs in operational settings. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Adaptive Testing, Computer Assisted Testing, Scores computerized adaptive testing, Test, Test Anxiety}, author = {Pitkin, A. K. and Vispoel, W. P.} } @article {90, title = {Final answer?}, journal = {American School Board Journal}, volume = {188}, number = {3}, year = {2001}, pages = {24-26}, abstract = {The Northwest Evaluation Association helped an Indiana school district develop a computerized adaptive testing system that was aligned with its curriculum and geared toward measuring individual student growth. Now the district can obtain such information from semester to semester and year to year, get immediate results, and test students on demand. (MLH)}, keywords = {computerized adaptive testing}, author = {Coyle, J.} } @article {188, title = {Item selection in computerized adaptive testing: Should more discriminating items be used first?}, journal = {Journal of Educational Measurement}, volume = {38}, number = {3}, year = {2001}, pages = {249-266}, abstract = {During computerized adaptive testing (CAT), items are selected continuously according to the test-taker{\textquoteright}s estimated ability. Test security has become a problem because high-discrimination items are more likely to be selected and become overexposed. So, there seems to be a tradeoff between high efficiency in ability estimations and balanced usage of items. 
This series of four studies addressed the dilemma by focusing on the notion of whether more or less discriminating items should be used first in CAT. The first study demonstrated that the common maximum information method with J. B. Sympson and R. D. Hetter (1985) control resulted in the use of more discriminating items first. The remaining studies showed that using items in the reverse order, as described in H. Chang and Z. Yings (1999) stratified method had potential advantages: (a) a more balanced item usage and (b) a relatively stable resultant item pool structure with easy and inexpensive management. This stratified method may have ability-estimation efficiency better than or close to that of other methods. It is argued that the judicious selection of items, as in the stratified method, is a more active control of item exposure. (PsycINFO Database Record (c) 2005 APA )}, keywords = {ability, Adaptive Testing, Computer Assisted Testing, Estimation, Statistical, Test Items computerized adaptive testing}, author = {Hau, Kit-Tai and Chang, Hua-Hua} } @article {392, title = {Multidimensional adaptive testing using the weighted likelihood estimation}, journal = {Dissertation Abstracts International Section A: Humanities \& Social Sciences}, volume = {61}, number = {12-A}, year = {2001}, pages = {4746}, abstract = {This study extended Warm{\textquoteright}s (1989) weighted likelihood estimation (WLE) to a multidimensional computerized adaptive test (MCAT) setting. WLE was compared with the maximum likelihood estimation (MLE), expected a posteriori (EAP), and maximum a posteriori (MAP) using a three-dimensional 3PL IRT model under a variety of computerized adaptive testing conditions. The dependent variables included bias, standard error of ability estimates (SE), square root of mean square error (RMSE), and test information. The independent variables were ability estimation methods, intercorrelation levels between dimensions, multidimensional structures, and ability combinations. Simulation results were presented in terms of descriptive statistics, such as figures and tables. In addition, inferential procedures were used to analyze bias by conceptualizing this Monte Carlo study as a statistical sampling experiment. The results of this study indicate that WLE and the other three estimation methods yield significantly more accurate ability estimates under an approximate simple test structure with one dominant dimension and several secondary dimensions. All four estimation methods, especially WLE, yield very large SEs when a three equally dominant multidimensional structure was employed. Consistent with previous findings based on unidimensional IRT model, MLE and WLE are less biased in the extreme of the ability scale; MLE and WLE yield larger SEs than the Bayesian methods; test information-based SEs underestimate actual SEs for both MLE and WLE in MCAT situations, especially at shorter test lengths; WLE reduced the bias of MLE under the approximate simple structure; test information-based SEs underestimates the actual SEs of MLE and WLE estimators in the MCAT conditions, similar to the findings of Warm (1989) in the unidimensional case. The results from the MCAT simulations did show some advantages of WLE in reducing the bias of MLE under the approximate simple structure with a fixed test length of 50 items, which was consistent with the previous research findings based on different unidimensional models. 
It is clear from the current results that all four methods perform very poorly when the multidimensional structures with multiple dominant factors were employed. More research efforts are urged to investigate systematically how different multidimensional structures affect the accuracy and reliability of ability estimation. Based on the simulated results in this study, there is no significant effect found on the ability estimation from the intercorrelation between dimensions. (PsycINFO Database Record (c) 2003 APA, all rights reserved).}, keywords = {computerized adaptive testing}, author = {Tseng, F-L.} } @article {279, title = {Nouveaux d{\'e}veloppements dans le domaine du testing informatis{\'e} [New developments in the area of computerized testing]}, journal = {Psychologie Fran{\c c}aise}, volume = {46}, number = {3}, year = {2001}, pages = {221-230}, abstract = {L{\textquoteright}usage de l{\textquoteright}{\'e}valuation assist{\'e}e par ordinateur s{\textquoteright}est fortement d{\'e}velopp{\'e} depuis la premi{\`e}re formulation de ses principes de base dans les ann{\'e}es soixante et soixante-dix. Cet article offre une introduction aux derniers d{\'e}veloppements dans le domaine de l{\textquoteright}{\'e}valuation assist{\'e}e par ordinateur, en particulier celui du testing adaptative informatis{\'e}e (TAI). L{\textquoteright}estimation de l{\textquoteright}aptitude, la s{\'e}lection des items et le d{\'e}veloppement d{\textquoteright}une base d{\textquoteright}items dans le cas du TAI sont discut{\'e}s. De plus, des exemples d{\textquoteright}utilisations innovantes de l{\textquoteright}ordinateur dans des syst{\`e}mes int{\'e}gr{\'e}s de testing et de testing via Internet sont pr{\'e}sent{\'e}s. L{\textquoteright}article se termine par quelques illustrations de nouvelles applications du testing informatis{\'e} et des suggestions pour des recherches futures.Discusses the latest developments in computerized psychological assessment, with emphasis on computerized adaptive testing (CAT). Ability estimation, item selection, and item pool development in CAT are described. Examples of some innovative approaches to CAT are presented. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Adaptive Testing, Computer Applications, Computer Assisted, Diagnosis, Psychological Assessment computerized adaptive testing}, author = {Meijer, R. R. and Gr{\'e}goire, J.} } @article {39, title = {Outlier measures and norming methods for computerized adaptive tests}, journal = {Journal of Educational and Behavioral Statistics}, volume = {26}, number = {1}, year = {2001}, pages = {85-104}, abstract = {Notes that the problem of identifying outliers has 2 important aspects: the choice of outlier measures and the method to assess the degree of outlyingness (norming) of those measures. Several classes of measures for identifying outliers in Computerized Adaptive Tests (CATs) are introduced. Some of these measures are constructed to take advantage of CATs{\textquoteright} sequential choice of items; other measures are taken directly from paper and pencil (P\&P) tests and are used for baseline comparisons. Assessing the degree of outlyingness of CAT responses, however, can not be applied directly from P\&P tests because stopping rules associated with CATs yield examinee responses of varying lengths. Standard outlier measures are highly correlated with the varying lengths which makes comparison across examinees impossible. 
Therefore, 4 methods are presented and compared which map outlier statistics to a familiar probability scale (a p value). The methods are explored in the context of CAT data from a 1995 Nationally Administered Computerized Examination (NACE). (PsycINFO Database Record (c) 2005 APA )}, keywords = {Adaptive Testing, Computer Assisted Testing, Statistical Analysis, Test Norms}, author = {Bradlow, E. T. and Weiss, R. E.} } @article {383, title = {Pasado, presente y futuro de los test adaptativos informatizados: Entrevista con Isaac I. B{\'e}jar [Past, present and future of computerized adaptive testing: Interview with Isaac I. B{\'e}jar]}, journal = {Psicothema}, volume = {13}, number = {4}, year = {2001}, pages = {685-690}, abstract = {En este art{\'\i}culo se presenta el resultado de una entrevista con Isaac I. Bejar. El Dr. Bejar es actualmente Investigador Cient{\'\i}fico Principal y Director del Centro para el Dise{\~n}o de Evaluaci{\'o}n y Sistemas de Puntuaci{\'o}n perteneciente a la Divisi{\'o}n de Investigaci{\'o}n del Servicio de Medici{\'o}n Educativa (Educa - tional Testing Service, Princeton, NJ, EE.UU.). El objetivo de esta entrevista fue conversar sobre el pasado, presente y futuro de los Tests Adaptativos Informatizados. En la entrevista se recogen los inicios de los Tests Adaptativos y de los Tests Adaptativos Informatizados y {\'u}ltimos avances que se desarrollan en el Educational Testing Service sobre este tipo de tests (modelos generativos, isomorfos, puntuaci{\'o}n autom{\'a}tica de {\'\i}tems de ensayo{\textellipsis}). Se finaliza con la visi{\'o}n de futuro de los Tests Adaptativos Informatizados y su utilizaci{\'o}n en Espa{\~n}a.Past, present and future of Computerized Adaptive Testing: Interview with Isaac I. Bejar. In this paper the results of an interview with Isaac I. Bejar are presented. Dr. Bejar is currently Principal Research Scientist and Director of Center for Assessment Design and Scoring, in Research Division at Educational Testing Service (Princeton, NJ, U.S.A.). The aim of this interview was to review the past, present and future of the Computerized Adaptive Tests. The beginnings of the Adaptive Tests and Computerized Adaptive Tests, and the latest advances developed at the Educational Testing Service (generative response models, isomorphs, automated scoring{\textellipsis}) are reviewed. The future of Computerized Adaptive Tests is analyzed, and its utilization in Spain commented.}, keywords = {computerized adaptive testing}, isbn = {0214-9915}, author = {Tejada, R. and Antonio, J.} } @inbook {362, title = {Practical issues in setting standards on computerized adaptive tests}, booktitle = {Setting performance standards: Concepts, methods, and perspectives}, year = {2001}, note = {Using Smart Source ParsingSetting performance standards: Concepts, methods, and perspectives. (pp. 355-369). Mahwah, NJ : Lawrence Erlbaum Associates, Publishers. xiii, 510 pp}, pages = {355-369}, publisher = {Lawrence Erlbaum Associates, Inc.}, organization = {Lawrence Erlbaum Associates, Inc.}, address = {Mahwah, N.J. USA}, abstract = {(From the chapter) Examples of setting standards on computerized adaptive tests (CATs) are hard to find. Some examples of CATs involving performance standards include the registered nurse exam and the Novell systems engineer exam. Although CATs do not require separate standard setting-methods, there are special issues to be addressed by test specialist who set performance standards on CATs. 
Setting standards on a CAT will typically require modifications to the procedures used with more traditional, fixed-form, paper-and-pencil examinations. The purpose of this chapter is to illustrate why CATs pose special challenges to the standard setter. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Adaptive Testing, Computer Assisted Testing, Performance Tests, Testing Methods}, author = {Sireci, S. G. and Clauser, B. E.} } @article {297, title = {Requerimientos, aplicaciones e investigaci{\'o}n en tests adaptativos informatizados [Requirements, applications, and investigation in computerized adaptive testing]}, journal = {Apuntes de Psicologia}, volume = {19}, number = {1}, year = {2001}, pages = {11-28}, abstract = {Summarizes the main requirements and applications of computerized adaptive testing (CAT) with emphasis on the differences between CAT and conventional computerized tests. Psychometric properties of estimations based on CAT, item selection strategies, and implementation software are described. Results of CAT studies in Spanish-speaking samples are described. Implications for developing a CAT measuring the English vocabulary of Spanish-speaking students are discussed. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Computer Assisted Testing, English as Second Language, Psychometrics computerized adaptive testing}, author = {Olea D{\'\i}az, J. and Ponsoda Gil, V. and Revuelta Men{\'e}ndez, J. and Hontangas Beltr{\'a}n, P. and Abad, F. J.} } @article {197, title = {Toepassing van een computergestuurde adaptieve testprocedure op persoonlijkheidsdata [Application of a computerised adaptive test procedure on personality data]}, journal = {Nederlands Tijdschrift voor de Psychologie en haar Grensgebieden}, volume = {56}, number = {3}, year = {2001}, pages = {119-133}, abstract = {Studied the applicability of a computerized adaptive testing procedure to an existing personality questionnaire within the framework of item response theory. The procedure was applied to the scores of 1,143 male and female university students (mean age 21.8 yrs) in the Netherlands on the Neuroticism scale of the Amsterdam Biographical Questionnaire (G. J. Wilde, 1963). The graded response model (F. Samejima, 1969) was used. The quality of the adaptive test scores was measured based on their correlation with test scores for the entire item bank and on their correlation with scores on other scales from the personality test. The results indicate that computerized adaptive testing can be applied to personality scales. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Adaptive Testing, Computer Applications, Computer Assisted Testing, Personality Measures, Test Reliability computerized adaptive testing}, author = {Hol, A. M. and Vorst, H. C. M. and Mellenbergh, G. J.} } @article {107, title = {Algoritmo mixto m{\'\i}nima entrop{\'\i}a-m{\'a}xima informaci{\'o}n para la selecci{\'o}n de {\'\i}tems en un test adaptativo informatizado}, journal = {Psicothema}, volume = {12}, number = {2}, year = {2000}, pages = {12-14}, abstract = {El objetivo del estudio que presentamos es comparar la eficacia como estrategia de selecci{\'o}n de {\'\i}tems de tres algoritmos diferentes: a) basado en m{\'a}xima informaci{\'o}n; b) basado en m{\'\i}nima entrop{\'\i}a; y c) mixto m{\'\i}nima entrop{\'\i}a en los {\'\i}tems iniciales y m{\'a}xima informaci{\'o}n en el resto; bajo la hip{\'o}tesis de que el algoritmo mixto puede dotar al TAI de mayor eficacia.
Las simulaciones de procesos TAI se realizaron sobre un banco de 28 {\'\i}tems de respuesta graduada calibrado seg{\'u}n el modelo de Samejima, tomando como respuesta al TAI la respuesta original de los sujetos que fueron utilizados para la calibraci{\'o}n. Los resultados iniciales muestran c{\'o}mo el criterio mixto es m{\'a}s eficaz que cualquiera de los otros dos tomados independientemente. Dicha eficacia se maximiza cuando el algoritmo de m{\'\i}nima entrop{\'\i}a se restringe a la selecci{\'o}n de los primeros {\'\i}tems del TAI, ya que con las respuestas a estos primeros {\'\i}tems la estimaci{\'o}n de θ comienza a ser relevante y el algoritmo de m{\'a}xima informaci{\'o}n se optimiza. Item selection algorithms in computerized adaptive testing. The aim of this paper is to compare the efficacy of three different item selection algorithms in computerized adaptive testing (CAT). These algorithms are based as follows: the first one is based on Item Information, the second one on Entropy, and the last algorithm is a mixture of the two previous ones. The CAT process was simulated using an emotional adjustment item bank. This item bank contains 28 graded items in six categories, calibrated using Samejima{\textquoteright}s (1969) Graded Response Model. The initial results show that the mixed criterion algorithm performs better than the other ones.}, keywords = {computerized adaptive testing}, author = {Dorronsoro, J. R. and Santa-Cruz, C. and Rubio Franco, V. J. and Aguado Garc{\'\i}a, D.} } @article {402, title = {Capitalization on item calibration error in adaptive testing}, journal = {Applied Measurement in Education}, volume = {13}, number = {1}, year = {2000}, note = {References .Lawrence Erlbaum, US}, pages = {35-53}, abstract = {(from the journal abstract) In adaptive testing, item selection is sequentially optimized during the test. Because the optimization takes place over a pool of items calibrated with estimation error, capitalization on chance is likely to occur. How serious the consequences of this phenomenon are depends not only on the distribution of the estimation errors in the pool or the conditional ratio of the test length to the pool size given ability, but may also depend on the structure of the item selection criterion used. A simulation study demonstrated a dramatic impact of capitalization on estimation errors on ability estimation. Four different strategies to minimize the likelihood of capitalization on error in computerized adaptive testing are discussed.}, keywords = {computerized adaptive testing}, author = {van der Linden, W. J. and Glas, C. A. W.} } @article {309, title = {A comparison of computerized adaptive testing and multistage testing}, journal = {Dissertation Abstracts International: Section B: the Sciences \& Engineering}, volume = {60}, number = {11-B}, year = {2000}, pages = {5829}, abstract = {There is considerable evidence to show that computerized-adaptive testing (CAT) and multi-stage testing (MST) are viable frameworks for testing. With many testing organizations looking to move towards CAT or MST, it is important to know what framework is superior in different situations and at what cost in terms of measurement. What was needed was a comparison of the different testing procedures under various realistic testing conditions. This dissertation addressed the important problem of the increase or decrease in accuracy of ability estimation in using MST rather than CAT.
The purpose of this study was to compare the accuracy of ability estimates produced by MST and CAT while keeping some variables fixed and varying others. A simulation study was conducted to investigate the effects of several factors on the accuracy of ability estimation using different CAT and MST designs. The factors that were manipulated are the number of stages, the number of subtests per stage, and the number of items per subtest. Kept constant were test length, distribution of subtest information, method of determining cut-points on subtests, amount of overlap between subtests, and method of scoring total test. The primary question of interest was, given a fixed test length, how many stages and many subtests per stage should there be to maximize measurement precision? Furthermore, how many items should there be in each subtest? Should there be more in the routing test or should there be more in the higher stage tests? Results showed that, in general, increasing the number of stages from two to three decreased the amount of errors in ability estimation. Increasing the number of subtests from three to five increased the accuracy of ability estimates as well as the efficiency of the MST designs relative to the P\&P and CAT designs at most ability levels (-.75 to 2.25). Finally, at most ability levels (-.75 to 2.25), varying the number of items per stage had little effect on either the resulting accuracy of ability estimates or the relative efficiency of the MST designs to the P\&P and CAT designs. (PsycINFO Database Record (c) 2003 APA, all rights reserved).}, keywords = {computerized adaptive testing}, author = {Patsula, L N.} } @article {70, title = {A comparison of item selection rules at the early stages of computerized adaptive testing}, journal = {Applied Psychological Measurement}, volume = {24}, number = {3}, year = {2000}, pages = {241-255}, abstract = {The effects of 5 item selection rules--Fisher information (FI), Fisher interval information (FII), Fisher information with a posterior distribution (FIP), Kullback-Leibler information (KL), and Kullback-Leibler information with a posterior distribution (KLP)--were compared with respect to the efficiency and precision of trait (θ) estimation at the early stages of computerized adaptive testing (CAT). FII, FIP, KL, and KLP performed marginally better than FI at the early stages of CAT for θ=-3 and -2. For tests longer than 10 items, there appeared to be no precision advantage for any of the selection rules. (PsycINFO Database Record (c) 2005 APA ) (journal abstract)}, keywords = {Adaptive Testing, Computer Assisted Testing, Item Analysis (Test), Statistical Estimation computerized adaptive testing}, author = {Chen, S-Y. and Ankenmann, R. D. and Chang, Hua-Hua} } @inbook {255, title = {Computer-adaptive testing: A methodology whose time has come}, booktitle = {Development of Computerised Middle School Achievement Tests}, volume = {69}, year = {2000}, publisher = {MESA}, organization = {MESA}, address = {Chicago, IL. USA}, keywords = {computerized adaptive testing}, author = {Linacre, J. M.}, editor = {Kang, U. and Jean, E. and Linacre, J. M.} } @article {329, title = {Computerization and adaptive administration of the NEO PI-R}, journal = {Assessment}, volume = {7}, number = {4}, year = {2000}, note = {1073-1911 (Print)Journal Article}, pages = {347-64}, abstract = {This study asks, how well does an item response theory (IRT) based computerized adaptive NEO PI-R work? 
To explore this question, real-data simulations (N = 1,059) were used to evaluate a maximum information item selection computerized adaptive test (CAT) algorithm. Findings indicated satisfactory recovery of full-scale facet scores with the administration of around four items per facet scale. Thus, the NEO PI-R could be reduced by half with little loss in precision by CAT administration. However, results also indicated that the CAT algorithm was not necessary. We found that for many scales, administering the "best" four items per facet scale would have produced similar results. In the conclusion, we discuss the future of computerized personality assessment and describe the role IRT methods might play in such assessments.}, keywords = {*Personality Inventory, Algorithms, California, Diagnosis, Computer-Assisted/*methods, Humans, Models, Psychological, Psychometrics/methods, Reproducibility of Results}, author = {Reise, S. P. and Henson, J. M.} } @article {115, title = {Computerized adaptive testing for classifying examinees into three categories}, journal = {Educational and Psychological Measurement}, volume = {60}, number = {5}, year = {2000}, pages = {713-34}, abstract = {The objective of this study was to explore the possibilities for using computerized adaptive testing in situations in which examinees are to be classified into one of three categories. Testing algorithms with two different statistical computation procedures are described and evaluated. The first computation procedure is based on statistical testing and the other on statistical estimation. Item selection methods based on maximum information (MI) considering content and exposure control are considered. The measurement quality of the proposed testing algorithms is reported. The results of the study are that a reduction of at least 22\% in the mean number of items can be expected in a computerized adaptive test (CAT) compared to an existing paper-and-pencil placement test. Furthermore, statistical testing is a promising alternative to statistical estimation. Finally, it is concluded that imposing constraints on the MI selection strategy does not negatively affect the quality of the testing algorithms.}, keywords = {computerized adaptive testing, Computerized classification testing}, author = {Theo Eggen and Straetmans, G. J. J. M.} } @article {378, title = {The development of a computerized version of Vandenberg{\textquoteright}s mental rotation test and the effect of visuo-spatial working memory loading}, journal = {Dissertation Abstracts International Section A: Humanities and Social Sciences}, volume = {60}, number = {11-A}, year = {2000}, pages = {3938}, abstract = {This dissertation focused on the generation and evaluation of web-based versions of Vandenberg{\textquoteright}s Mental Rotation Test. Memory and spatial visualization theory were explored in relation to the addition of a visuo-spatial working memory component. Analysis of the data determined that there was a significant difference between scores on the MRT Computer and MRT Memory test. The addition of a visuo-spatial working memory component did significantly affect results at the .05 alpha level. Reliability and discrimination estimates were higher on the MRT Memory version. The computerization of the paper and pencil version of the MRT did not significantly affect scores but did affect the time required to complete the test.
The population utilized in the quasi-experiment consisted of 107 university students from eight institutions in engineering graphics-related courses. The subjects completed two researcher-developed, Web-based versions of Vandenberg{\textquoteright}s Mental Rotation Test and the original paper-and-pencil version of the Mental Rotation Test. One version of the test included a visuo-spatial working memory loading. Significant contributions of this study included developing and evaluating computerized versions of Vandenberg{\textquoteright}s Mental Rotation Test. Previous versions of Vandenberg{\textquoteright}s Mental Rotation Test did not take advantage of the ability of the computer to incorporate an interaction factor, such as a visuo-spatial working memory loading, into the test. The addition of an interaction factor results in a more discriminating test, which will lend itself well to computerized adaptive testing practices. Educators in engineering graphics-related disciplines should strongly consider the use of spatial visualization tests to aid in establishing the effects of modern computer systems on fundamental design/drafting skills. Regular testing of spatial visualization skills will assist in the creation of a more relevant curriculum. Computerized tests that are valid and reliable will assist in making this task feasible. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Computer Assisted Testing, Mental Rotation, Short Term Memory, computerized adaptive testing, Test Construction, Test Validity, Visuospatial Memory}, author = {Strong, S. D.} } @article {351, title = {Diagnostische programme in der Demenzfr{\"u}herkennung: Der Adaptive Figurenfolgen-Lerntest (ADAFI) [Diagnostic programs in the early detection of dementia: The Adaptive Figure Series Learning Test (ADAFI)]}, journal = {Zeitschrift f{\"u}r Gerontopsychologie \& -Psychiatrie}, volume = {13}, number = {1}, year = {2000}, pages = {16-29}, abstract = {The aim of this study was to examine the ability of the computerized Adaptive Figure Series Learning Test (ADAFI) to differentiate among old subjects at risk for dementia and old healthy controls.
Several studies on the subject of measuring the intellectual potential (cognitive plasticity) of old subjects have shown the usefulness of the fluid intelligence type of task used in the ADAFI (completion of figure series) for this differentiation. Because the ADAFI has been developed as a Diagnostic Program, it is able to counter some critical issues in those studies. It was shown a) that distinct differences between both groups are revealed by the ADAFI, b) that the prediction of the cognitive health status of individual subjects is quite good (sensitivity: 80 \%, specificity: 90 \%), and c) that the prediction of the cognitive health status with tests of processing speed and working memory is worse than with the ADAFI. The results indicate that the ADAFI might be a promising plasticity-oriented tool for the measurement of cognitive decline in the elderly, and thus might be useful for the early detection of dementia.}, keywords = {Adaptive Testing, At Risk Populations, Computer Assisted Diagnosis, Dementia}, author = {Schreiber, M. D. and Schneider, R. J. and Schweizer, A. and Beckmann, J. F. and Baltissen, R.} } @article {179, title = {Emergence of item response modeling in instrument development and data analysis}, journal = {Medical Care}, volume = {38}, number = {Suppl. 9}, year = {2000}, pages = {II60-II65}, keywords = {Computer Assisted Testing, Health, Item Response Theory, Measurement, Statistical Validity, computerized adaptive testing, Test Construction, Treatment Outcomes}, author = {Hambleton, R. K.} } @article {74, title = {Estimation of trait level in computerized adaptive testing}, journal = {Applied Psychological Measurement}, volume = {24}, number = {3}, year = {2000}, pages = {257-265}, abstract = {Notes that in computerized adaptive testing (CAT), an examinee{\textquoteright}s trait level (θ) must be estimated with reasonable accuracy based on a small number of item responses. A successful implementation of CAT depends on (1) the accuracy of statistical methods used for estimating θ and (2) the efficiency of the item-selection criterion. Methods of estimating θ suitable for CAT are reviewed, and the differences between Fisher and Kullback-Leibler information criteria for selecting items are discussed. The accuracy of different CAT algorithms was examined in an empirical study. The results show that correcting θ estimates for bias was necessary at earlier stages of CAT, but most CAT algorithms performed equally well for tests of 10 or more items. (PsycINFO Database Record (c) 2005 APA )}, keywords = {(Statistical), Adaptive Testing, Computer Assisted Testing, Item Analysis, Statistical Estimation, computerized adaptive testing}, author = {Cheng, P. E. and Liou, M.} } @article {41, title = {An examination of the reliability and validity of performance ratings made using computerized adaptive rating scales}, journal = {Dissertation Abstracts International: Section B: The Sciences and Engineering}, volume = {61}, number = {1-B}, year = {2000}, pages = {570}, abstract = {This study compared the psychometric properties of performance ratings made using recently developed computerized adaptive rating scales (CARS) to the psychometric properties of ratings made using more traditional paper-and-pencil rating formats, i.e., behaviorally-anchored and graphic rating scales. Specifically, the reliability, validity, and accuracy of the performance ratings from each format were examined.
One hundred twelve participants viewed six 5-minute videotapes of office situations and rated the performance of a target person in each videotape on three contextual performance dimensions (Personal Support, Organizational Support, and Conscientious Initiative) using CARS and either behaviorally-anchored or graphic rating scales. Performance rating properties were measured using Shrout and Fleiss{\textquoteright}s intraclass correlation (2, 1), Borman{\textquoteright}s differential accuracy measure, and Cronbach{\textquoteright}s accuracy components as indexes of rating reliability, validity, and accuracy, respectively. Results found that performance ratings made using the CARS were significantly more reliable and valid than performance ratings made using either of the other formats. Additionally, CARS yielded more accurate performance ratings than the paper-and-pencil formats. The nature of the CARS system (i.e., its adaptive nature and scaling methodology) and its paired comparison judgment task are offered as possible reasons for the differences found in the psychometric properties of the performance ratings made using the various rating formats. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Adaptive Testing, Computer Assisted Testing, Performance Tests, Rating Scales, Reliability, Test, Test Validity}, author = {Buck, D. E.} } @article {203, title = {Los tests adaptativos informatizados en la frontera del siglo XXI: Una revisi{\'o}n [Computerized adaptive tests at the turn of the 21st century: A review]}, journal = {Metodolog{\'\i}a de las Ciencias del Comportamiento}, volume = {2}, number = {2}, year = {2000}, pages = {183-216}, keywords = {computerized adaptive testing}, isbn = {1575-9105}, author = {Hontangas, P. and Ponsoda, V. and Olea, J. and Abad, F. J.} } @article {317, title = {Overview of the computerized adaptive testing special section}, journal = {Psicol{\'o}gica}, volume = {21}, number = {1-2}, year = {2000}, pages = {115-120}, abstract = {This paper provides an overview of the five papers included in the Psicologica special section on computerized adaptive testing. A short introduction to this topic is presented as well. The main results, the links between the five papers, and the general research topic to which they are most closely related are also shown. (PsycINFO Database Record (c) 2005 APA )}, keywords = {Adaptive Testing, Computers, computerized adaptive testing}, author = {Ponsoda, V.} } @article {413, title = {Taylor approximations to logistic IRT models and their use in adaptive testing}, journal = {Journal of Educational and Behavioral Statistics}, volume = {25}, number = {3}, year = {2000}, pages = {307-343}, abstract = {Taylor approximation can be used to generate a linear approximation to a logistic ICC and a linear ability estimator. For a specific situation it will be shown to result in a special case of a Robbins-Monro item selection procedure for adaptive testing. The linear estimator can be used for the situation of zero and perfect scores when maximum likelihood estimation fails to come up with a finite estimate. It is also possible to use this estimator to generate starting values for maximum likelihood and weighted likelihood estimation. Approximations to the expectation and variance of the linear estimator for a sequence of Robbins-Monro item selections can be determined analytically. }, keywords = {computerized adaptive testing}, author = {Veerkamp, W. J.
J.} } @article {276, title = {Alternative methods for the detection of item preknowledge in computerized adaptive testing}, journal = {Dissertation Abstracts International: Section B: the Sciences \& Engineering}, volume = {59}, number = {7-B}, year = {1999}, pages = {3765}, keywords = {computerized adaptive testing}, author = {McLeod, Lori Davis} } @article {59, title = {a-stratified multistage computerized adaptive testing}, journal = {Applied Psychological Measurement}, volume = {23}, number = {3}, year = {1999}, note = {Sage Publications, US}, pages = {211-222}, abstract = {For computerized adaptive tests (CAT) based on the three-parameter logistic model, it was found that administering items with low discrimination parameter (a) values early in the test and administering those with high a values later was advantageous; the skewness of item exposure distributions was reduced while efficiency was maintained in trait level estimation. Thus, a new multistage adaptive testing approach is proposed that factors a into the item selection process. In this approach, the items in the item bank are stratified into a number of levels based on their a values. The early stages of a test use items with lower a values and later stages use items with higher a values. At each stage, items are selected according to an optimization criterion from the corresponding level. Simulation studies were performed to compare a-stratified CATs with CATs based on the Sympson-Hetter method for controlling item exposure. Results indicated that this new strategy led to tests that were well-balanced with respect to item exposure, and efficient. The a-stratified CATs achieved a lower average exposure rate than CATs based on Bayesian or information-based item selection and the Sympson-Hetter method. (PsycINFO Database Record (c) 2003 APA, all rights reserved).}, keywords = {computerized adaptive testing}, author = {Chang, Hua-Hua and Ying, Z.} } @inbook {26, title = {CAT for certification and licensure}, booktitle = {Innovations in computerized assessment}, year = {1999}, note = {Using Smart Source ParsingInnovations in computerized assessment. (pp. 67-91). xiv, 266pp}, pages = {67-91}, publisher = {Lawrence Erlbaum Associates}, organization = {Lawrence Erlbaum Associates}, address = {Mahwah, N.J.}, abstract = {(from the chapter) This chapter discusses implementing computerized adaptive testing (CAT) for high-stakes examinations that determine whether or not a particular candidate will be certified or licensed. The experience of several boards who have chosen to administer their licensure or certification examinations using the principles of CAT illustrates the process of moving into this mode of administration. Examples of the variety of options that can be utilized within a CAT administration are presented, the decisions that boards must make to implement CAT are discussed, and a timetable for completing the tasks that need to be accomplished is provided. In addition to the theoretical aspects of CAT, practical issues and problems are reviewed. (PsycINFO Database Record (c) 2002 APA, all rights reserved).}, keywords = {computerized adaptive testing}, author = {Bergstrom, Betty A. and Lunz, M. E.} } @article {28, title = {Competency gradient for child-parent centers}, journal = {Journal of Outcomes Measurement}, volume = {3}, number = {1}, year = {1999}, note = {1090-655X (Print)Journal ArticleResearch Support, U.S.
Gov{\textquoteright}t, P.H.S.}, pages = {35-52}, abstract = {This report describes an implementation of the Rasch model during the longitudinal evaluation of a federally-funded early childhood preschool intervention program. An item bank is described for operationally defining a psychosocial construct called community life-skills competency, an expected teenage outcome of the preschool intervention. This analysis examined the position of teenage students on this scale structure, and investigated a pattern of cognitive operations necessary for students to pass community life-skills test items. Then this scale structure was correlated with nationally standardized reading and math achievement scores, teacher ratings, and school records to assess its validity as a measure of the community-related outcome goal for this intervention. The results show a functional relationship between years of early intervention and magnitude of effect on the life-skills competency variable.}, keywords = {*Models, Statistical, Activities of Daily Living/classification/psychology, Adolescent, Chicago, Child, Child, Preschool, Early Intervention (Education)/*statistics \& numerical data, Female, Follow-Up Studies, Humans, Male, Outcome and Process Assessment (Health Care)/*statistics \& numerical data}, author = {Bezruczko, N.} } @article {280, title = {Computerized Adaptive Testing: Overview and Introduction}, journal = {Applied Psychological Measurement}, volume = {23}, number = {3}, year = {1999}, pages = {187-94}, abstract = {Use of computerized adaptive testing (CAT) has increased substantially since it was first formulated in the 1970s. This paper provides an overview of CAT and introduces the contributions to this Special Issue. The elements of CAT discussed here include item selection procedures, estimation of the latent trait, item exposure, measurement precision, and item bank development. Some topics for future research are also presented. }, keywords = {computerized adaptive testing}, author = {Meijer, R. R. and Nering, M. L.} } @article {220, title = {The effect of model misspecification on classification decisions made using a computerized test}, journal = {Journal of Educational Measurement}, volume = {36}, number = {1}, year = {1999}, note = {National Council on Measurement in Education, US}, pages = {47-59}, abstract = {Many computerized testing algorithms require the fitting of some item response theory (IRT) model to examinees{\textquoteright} responses to facilitate item selection, the determination of test stopping rules, and classification decisions. Some IRT models are thought to be particularly useful for small volume certification programs that wish to make the transition to computerized adaptive testing (CAT). The 1-parameter logistic model (1-PLM) is usually assumed to require a smaller sample size than the 3-parameter logistic model (3-PLM) for item parameter calibrations. This study examined the effects of model misspecification on the precision of the decisions made using the sequential probability ratio test. For this comparison, the 1-PLM was used to estimate item parameters, even though the items{\textquoteright} characteristics were represented by a 3-PLM. Results demonstrate that the 1-PLM produced considerably more decision errors under simulation conditions similar to a real testing environment, compared to the true model and to a fixed-form standard reference set of items. 
(PsycINFO Database Record (c) 2003 APA, all rights reserved).}, keywords = {computerized adaptive testing}, author = {Kalohn, J.C. and Spray, J. A.} } @article {6, title = {Graphical models and computerized adaptive testing}, journal = {Applied Psychological Measurement}, volume = {23}, number = {3}, year = {1999}, pages = {223-37}, abstract = {Considers computerized adaptive testing from the perspective of graphical modeling (GM). GM provides methods for making inferences about multifaceted skills and knowledge and for extracting data from complex performances. Provides examples from language-proficiency assessment. (SLD)}, keywords = {computerized adaptive testing}, author = {Almond, R. G. and Mislevy, R. J.} } @book {110, title = {Innovations in computerized assessment}, year = {1999}, note = {EDRS Availability: None. Lawrence Erlbaum Associates, Inc., Publishers, 10 Industrial Avenue, Mahwah, New Jersey 07430-2262 (paperback: ISBN-0-8058-2877-X, $29.95; clothbound: ISBN-0-8058-2876-1, $59.95). Tel: 800-926-6579 (Toll Free).}, publisher = {Lawrence Erlbaum Associates, Inc.}, organization = {Lawrence Erlbaum Associates, Inc.}, address = {Mahwah, N.J.}, abstract = {Chapters in this book present the challenges and dilemmas faced by researchers as they created new computerized assessments, focusing on issues addressed in developing, scoring, and administering the assessments. Chapters are: (1) "Beyond Bells and Whistles; An Introduction to Computerized Assessment" (Julie B. Olson-Buchanan and Fritz Drasgow); (2) "The Development of a Computerized Selection System for Computer Programmers in a Financial Services Company" (Michael J. Zickar, Randall C. Overton, L. Rogers Taylor, and Harvey J. Harms); (3) "Development of the Computerized Adaptive Testing Version of the Armed Services Vocational Aptitude Battery" (Daniel O. Segall and Kathleen E. Moreno); (4) "CAT for Certification and Licensure" (Betty A. Bergstrom and Mary E. Lunz); (5) "Developing Computerized Adaptive Tests for School Children" (G. Gage Kingsbury and Ronald L. Houser); (6) "Development and Introduction of a Computer Adaptive Graduate Record Examinations General Test" (Craig N. Mills); (7) "Computer Assessment Using Visual Stimuli: A Test of Dermatological Skin Disorders" (Terry A. Ackerman, John Evans, Kwang-Seon Park, Claudia Tamassia, and Ronna Turner); (8) "Creating Computerized Adaptive Tests of Music Aptitude: Problems, Solutions, and Future Directions" (Walter P. Vispoel); (9) "Development of an Interactive Video Assessment: Trials and Tribulations" (Fritz Drasgow, Julie B. Olson-Buchanan, and Philip J. Moberg); (10) "Computerized Assessment of Skill for a Highly Technical Job" (Mary Ann Hanson, Walter C. Borman, Henry J. Mogilka, Carol Manning, and Jerry W. Hedge); (11) "Easing the Implementation of Behavioral Testing through Computerization" (Wayne A. Burroughs, Janet Murray, S. Scott Wesley, Debra R. Medina, Stacy L. Penn, Steven R. Gordon, and Michael Catello); and (12) "Blood, Sweat, and Tears: Some Final Comments on Computerized Assessment." (Fritz Drasgow and Julie B. Olson-Buchanan). Each chapter contains references. (Contains 17 tables and 21 figures.) (SLD)}, keywords = {computerized adaptive testing}, author = {F Drasgow and Olson-Buchanan, J. 
B.} } @article {395, title = {Multidimensional adaptive testing with a minimum error-variance criterion}, journal = {Journal of Educational and Behavioral Statistics}, volume = {24}, number = {4}, year = {1999}, pages = {398-412}, abstract = {Adaptive testing under a multidimensional logistic response model is addressed. An algorithm is proposed that minimizes the (asymptotic) variance of the maximum-likelihood estimator of a linear combination of abilities of interest. The criterion results in a closed-form expression that is easy to evaluate. In addition, it is shown how the algorithm can be modified if the interest is in a test with a "simple ability structure". The statistical properties of the adaptive ML estimator are demonstrated for a two-dimensional item pool with several linear combinations of the abilities. }, keywords = {computerized adaptive testing}, author = {van der Linden, W. J.} } @article {45, title = {Optimal design for item calibration in computerized adaptive testing}, journal = {Dissertation Abstracts International: Section B: the Sciences \& Engineering}, volume = {59}, number = {8-B}, year = {1999}, pages = {4220}, abstract = {Item Response Theory is the psychometric model used for standardized tests such as the Graduate Record Examination. A test-taker{\textquoteright}s response to an item is modelled as a binary response with success probability depending on parameters for both the test-taker and the item. Two popular models are the two-parameter logistic (2PL) model and the three-parameter logistic (3PL) model. For the 2PL model, the logit of the probability of a correct response equals a_i(θ_j - b_i), where a_i and b_i are item parameters, while θ_j is the test-taker{\textquoteright}s parameter, known as "proficiency." The 3PL model adds a nonzero left asymptote to model random response behavior by low-θ test-takers. Assigning scores to students requires accurate estimation of the θs, while accurate estimation of the θs requires accurate estimation of the item parameters. The operational implementation of Item Response Theory, particularly following the advent of computerized adaptive testing, generally involves handling these two estimation problems separately. This dissertation addresses the optimal design for item parameter estimation. Most current designs calibrate items with a sample drawn from the overall test-taking population. For 2PL models, a sequential design based on the D-optimality criterion has been proposed, while no 3PL design is in the literature. In this dissertation, we design the calibration with the ultimate use of the items in mind, namely to estimate test-takers{\textquoteright} proficiency parameters. For both the 2PL and 3PL models, this criterion leads to a locally L-optimal design criterion, named the Minimal Information Loss criterion. In turn, this criterion and the General Equivalence Theorem give a two-point design for the 2PL model and a three-point design for the 3PL model. A sequential implementation of this optimal design is presented. For the 2PL model, this design is almost 55\% more efficient than the simple random sample approach, and 12\% more efficient than the locally D-optimal design. For the 3PL model, the proposed design is 34\% more efficient than the simple random sample approach. (PsycINFO Database Record (c) 2003 APA, all rights reserved).}, keywords = {computerized adaptive testing}, author = {Buyske, S.
G.} } @article {406, title = {Using response-time constraints to control for differential speededness in computerized adaptive testing}, journal = {Applied Psychological Measurement}, volume = {23}, number = {3}, year = {1999}, note = {Sage Publications, US}, pages = {195-210}, abstract = {An item-selection algorithm is proposed for neutralizing the differential effects of time limits on computerized adaptive test scores. The method is based on a statistical model for distributions of examinees{\textquoteright} response times on items in a bank that is updated each time an item is administered. Predictions from the model are used as constraints in a 0-1 linear programming model for constrained adaptive testing that maximizes the accuracy of the trait estimator. The method is demonstrated empirically using an item bank from the Armed Services Vocational Aptitude Battery. }, keywords = {computerized adaptive testing}, author = {van der Linden, W. J. and Scrams, D. J. and Schnipke, D. L.} } @article {81, title = {Applications of network flows to computerized adaptive testing}, journal = {Dissertation Abstracts International: Section B: the Sciences \& Engineering}, volume = {59}, number = {2-B}, year = {1998}, pages = {0855}, abstract = {Recently, the concept of Computerized Adaptive Testing (CAT) has been receiving ever-growing attention from the academic community. This is so because of both practical and theoretical considerations. Its practical importance lies in the advantages of CAT over the traditional (perhaps outdated) paper-and-pencil test in terms of time, accuracy, and money. The theoretical interest is sparked by its natural relationship to Item Response Theory (IRT). This dissertation offers a mathematical programming approach which creates a model that generates a CAT that takes care of many questions concerning the test, such as feasibility, accuracy, and time of testing, as well as item pool security. The CAT generated is designed to obtain the most information about a single test taker. Several methods for estimating the examinee{\textquoteright}s ability, based on the (dichotomous) responses to the items in the test, are also offered here. (PsycINFO Database Record (c) 2003 APA, all rights reserved).}, keywords = {computerized adaptive testing}, author = {Claudio, M. J. C.} } @article {177, title = {The effect of item pool restriction on the precision of ability measurement for a Rasch-based CAT: comparisons to traditional fixed length examinations}, journal = {J Outcome Meas}, volume = {2}, number = {2}, year = {1998}, note = {983263801090-655xJournal Article}, pages = {97-122}, abstract = {This paper describes a method for examining the precision of a computerized adaptive test with a limited item pool. Standard errors of measurement ascertained in the testing of simulees with a CAT using a restricted pool were compared to the results obtained in live paper-and-pencil achievement testing of 4494 nursing students on four versions of an examination of calculations of drug administration. CAT measures of precision were considered when the simulated examinee pools were uniform and normal. Precision indices were also considered in terms of the number of CAT items required to reach the precision of the traditional tests.
Results suggest that regardless of the size of the item pool, CAT provides greater precision in measurement with a smaller number of items administered, even when the choice of items is limited, but fails to achieve equiprecision along the entire ability continuum.}, keywords = {*Decision Making, Computer-Assisted, Comparative Study, Computer Simulation, Education, Nursing, Educational Measurement/*methods, Human, Models, Statistical, Psychometrics/*methods}, author = {Halkitis, P. N.} } @article {261, title = {Maintaining content validity in computerized adaptive testing}, journal = {Advances in Health Sciences Education}, volume = {3}, number = {1}, year = {1998}, note = {Kluwer Academic Publishers, Netherlands}, pages = {29-41}, abstract = {The authors empirically demonstrate some of the trade-offs that can occur when content balancing is imposed in computerized adaptive testing (CAT) forms or, conversely, when it is ignored. The authors contend that the content validity of a CAT form can actually change across a score scale when content balancing is ignored. However, they caution that efficiency and score precision can be severely reduced by overspecifying content restrictions in a CAT form. The results from 2 simulation studies are presented as a means of highlighting some of the trade-offs that could occur between content and statistical considerations in CAT form assembly. (PsycINFO Database Record (c) 2003 APA, all rights reserved).}, keywords = {computerized adaptive testing}, author = {Luecht, RM and de Champlain, A. and Nungester, R. J.} } @article {405, title = {A model for optimal constrained adaptive testing}, journal = {Applied Psychological Measurement}, volume = {22}, number = {3}, year = {1998}, note = {Sage Publications, US}, pages = {259-270}, abstract = {A model for constrained computerized adaptive testing is proposed in which the information in the test at the trait level (θ) estimate is maximized subject to a number of possible constraints on the content of the test. At each item-selection step, a full test is assembled to have maximum information at the current θ estimate, fixing the items already administered. Then the item with maximum information is selected. All test assembly is optimal because a linear programming (LP) model is used that automatically updates to allow for the attributes of the items already administered and the new value of the θ estimator. The LP model also guarantees that each adaptive test always meets the entire set of constraints. A simulation study using a bank of 753 items from the Law School Admission Test showed that the θ estimator for adaptive tests of realistic lengths did not suffer any loss of efficiency from the presence of 433 constraints on the item selection process. }, keywords = {computerized adaptive testing}, author = {van der Linden, W. J. and Reese, L. M.} } @article {373, title = {Simulating the use of disclosed items in computerized adaptive testing}, journal = {Journal of Educational Measurement}, volume = {35}, number = {1}, year = {1998}, note = {National Council on Measurement in Education, US}, pages = {48-68}, abstract = {Regular use of questions previously made available to the public (i.e., disclosed items) may provide one way to meet the requirement for large numbers of questions in a continuous testing environment, that is, an environment in which testing is offered at test-taker convenience throughout the year rather than on a few prespecified test dates.
First, it must be shown that such use has effects on test scores small enough to be acceptable. In this study, simulations are used to explore the use of disclosed items under a worst-case scenario that assumes disclosed items are always answered correctly. Some item pool and test designs were identified in which the use of disclosed items produces effects on test scores that may be viewed as negligible.}, keywords = {computerized adaptive testing}, author = {Stocking, M. L. and W. C. Ward and Potenza, M. T.} } @article {66, title = {A comparison of maximum likelihood estimation and expected a posteriori estimation in computerized adaptive testing using the generalized partial credit model}, journal = {Dissertation Abstracts International: Section B: the Sciences \& Engineering}, volume = {58}, number = {1-B}, year = {1997}, pages = {453}, abstract = {A simulation study was conducted to investigate the application of expected a posteriori (EAP) trait estimation in computerized adaptive tests (CAT) based on the generalized partial credit model (Muraki, 1992), and to compare the performance of EAP with maximum likelihood trait estimation (MLE). The performance of EAP was evaluated under different conditions: the number of quadrature points (10, 20, and 30), and the type of prior distribution (normal, uniform, negatively skewed, and positively skewed). The relative performance of the MLE and EAP estimation methods was assessed under two distributional forms of the latent trait, one normal and the other negatively skewed. Also, both the known item parameters and estimated item parameters were employed in the simulation study. Descriptive statistics, correlations, scattergrams, accuracy indices, and audit trails were used to compare the different methods of trait estimation in CAT. The results showed that, regardless of the latent trait distribution, MLE and EAP with a normal prior, a uniform prior, or the prior that matches the latent trait distribution using either 20 or 30 quadrature points provided relatively accurate estimation in CAT based on the generalized partial credit model. However, EAP using only 10 quadrature points did not work well in the generalized partial credit CAT. Also, the study found that increasing the number of quadrature points from 20 to 30 did not increase the accuracy of EAP estimation. Therefore, it appears that 20 or more quadrature points are sufficient for accurate EAP estimation. The results also showed that EAP with a negatively skewed prior and a positively skewed prior performed poorly for the normal data set, and EAP with a positively skewed prior did not provide accurate estimates for the negatively skewed data set. Furthermore, trait estimation in CAT using estimated item parameters produced results similar to those obtained using known item parameters. In general, when at least 20 quadrature points are used, EAP estimation with a normal prior, a uniform prior, or the prior that matches the latent trait distribution appears to be a good alternative to MLE in the application of polytomous CAT based on the generalized partial credit model.
(PsycINFO Database Record (c) 2003 APA, all rights reserved).}, keywords = {computerized adaptive testing}, author = {Chen, S-K.} } @book {347, title = {Computerized adaptive testing: From inquiry to operation}, year = {1997}, note = {References .Using Smart Source Parsingxvii, pp}, publisher = {American Psychological Association}, organization = {American Psychological Association}, address = {Washington, D.C., USA}, abstract = {(from the cover) This book traces the development of computerized adaptive testing (CAT) from its origins in the 1960s to its integration with the Armed Services Vocational Aptitude Battery (ASVAB) in the 1990s. A paper-and-pencil version of the battery (P\&P-ASVAB) has been used by the Defense Department since the 1970s to measure the abilities of applicants for military service. The test scores are used both for initial qualification and for classification into entry-level training opportunities. /// This volume provides the developmental history of the CAT-ASVAB through its various stages in the Joint-Service arena. Although the majority of the book concerns the myriad technical issues that were identified and resolved, information is provided on various political and funding support challenges that were successfully overcome in developing, testing, and implementing the battery into one of the nation{\textquoteright}s largest testing programs. The book provides useful information to professionals in the testing community and everyone interested in personnel assessment and evaluation. (PsycINFO Database Record (c) 2004 APA, all rights reserved).}, keywords = {computerized adaptive testing}, author = {Sands, W. A. and B. K. Waters and J. R. McBride} } @article {292, title = {The distribution of indexes of person fit within the computerized adaptive testing environment}, journal = {Applied Psychological Measurement}, volume = {21}, number = {2}, year = {1997}, note = {Journal; Peer Reviewed Journal}, pages = {115-127}, abstract = {The extent to which a trait estimate represents the underlying latent trait of interest can be estimated by using indexes of person fit. Several statistical methods for indexing person fit have been proposed to identify nonmodel-fitting response vectors. These person-fit indexes have generally been found to follow a standard normal distribution for conventionally administered tests. The present investigation found that within the context of computerized adaptive testing (CAT) these indexes tended not to follow a standard normal distribution. As the item pool became less discriminating, as the CAT termination criterion became less stringent, and as the number of items in the pool decreased, the distributions of the indexes approached a standard normal distribution. It was determined that under these conditions the indexes{\textquoteright} distributions approached standard normal distributions because more items were being administered. However, even when over 50 items were administered in a CAT the indexes were distributed in a fashion that was different from what was expected. (PsycINFO Database Record (c) 2006 APA )}, keywords = {Adaptive Testing, Computer Assisted Testing, Fit, Person Environment}, author = {Nering, M. 
L.} } @article {67, title = {The effect of population distribution and method of theta estimation on computerized adaptive testing (CAT) using the rating scale model}, journal = {Educational \& Psychological Measurement}, volume = {57}, number = {3}, year = {1997}, note = {Sage Publications, US}, pages = {422-439}, abstract = {Investigated the effect of population distribution on maximum likelihood estimation (MLE) and expected a posteriori estimation (EAP) in a simulation study of computerized adaptive testing (CAT) based on D. Andrich{\textquoteright}s (1978) rating scale model. Comparisons were made among MLE, EAP with a normal prior distribution, and EAP with a uniform prior distribution within 2 data sets: one generated using a normal trait distribution and the other using a negatively skewed trait distribution. Descriptive statistics, correlations, scattergrams, and accuracy indices were used to compare the different methods of trait estimation. EAP estimation with a normal prior or a uniform prior yielded results similar to those obtained with MLE, even though the prior did not match the underlying trait distribution. An additional simulation study based on real data suggested that more work is needed to determine the optimal number of quadrature points for EAP in CAT based on the rating scale model. The choice between MLE and EAP for particular measurement situations is discussed. (PsycINFO Database Record (c) 2003 APA, all rights reserved).}, keywords = {computerized adaptive testing}, author = {Chen, S-K. and Hou, L. Y. and Fitzpatrick, S. J. and Dodd, B. G.} } @inbook {270, title = {Research antecedents of applied adaptive testing}, booktitle = {Computerized adaptive testing: From inquiry to practice}, year = {1997}, pages = {47-57}, publisher = {American Psychological Association}, organization = {American Psychological Association}, edition = {xviii}, address = {Washington D.C. USA}, abstract = {(from the chapter) This chapter sets the stage for the entire computerized adaptive testing Armed Services Vocational Aptitude Battery (CAT-ASVAB) development program by describing the state of the art immediately preceding its inception. By the mid-1970s, a great deal of research had been conducted that provided the technical underpinnings needed to develop adaptive tests, but little research had been done to corroborate empirically the promising results of theoretical analyses and computer simulation studies. In this chapter, the author summarizes much of the important theoretical and simulation research prior to 1977. In doing so, he describes a variety of approaches to adaptive testing, and shows that while many methods for adaptive testing had been proposed, few practical attempts had been made to implement it. Furthermore, the few instances of adaptive testing were based primarily on traditional test theory, and were developed in laboratory settings for purposes of basic research. The most promising approaches, those based on item response theory and evaluated analytically or by means of computer simulations, remained to be proven in the crucible of live testing. (PsycINFO Database Record (c) 2004 APA, all rights reserved).}, keywords = {computerized adaptive testing}, author = {J. R. McBride}, editor = {B. K. Waters and J. R.
McBride} } @article {371, title = {Revising item responses in computerized adaptive tests: A comparison of three models}, journal = {Applied Psychological Measurement}, volume = {21}, number = {2}, year = {1997}, note = {Sage Publications, US}, pages = {129-142}, abstract = {Interest in the application of large-scale computerized adaptive testing has focused attention on issues that arise when theoretical advances are made operational. One such issue is that of the order in which examinees address questions within a test or separately timed test section. In linear testing, this order is entirely under the control of the examinee, who can look ahead at questions and return and revise answers to questions. Using simulation, this study investigated three models that permit restricted examinee control over revising previous answers in the context of adaptive testing. Even under a worst-case model of examinee revision behavior, two of the models for permitting item revisions worked well in preserving test fairness and accuracy. One model studied may also preserve some cognitive processing styles developed by examinees for a linear testing environment. }, keywords = {computerized adaptive testing}, author = {Stocking, M. L.} } @conference {290, title = {Validation of CATSIB to investigate DIF of CAT data}, booktitle = {annual meeting of the American Educational Research Association}, year = {1997}, address = {Chicago, IL. USA}, abstract = {This paper investigates the performance of CATSIB (a modified version of the SIBTEST computer program) to assess differential item functioning (DIF) in the context of computerized adaptive testing (CAT). One of the distinguishing features of CATSIB is its theoretically built-in regression correction to control for the Type I error rates when the distributions of the reference and focal groups differ on the intended ability. This phenomenon is also called impact. The Type I error rate of CATSIB with the regression correction (WRC) was compared with that of CATSIB without the regression correction (WORC) to see if the regression correction was indeed effective. Also of interest was the power level of CATSIB after the regression correction. The subtest size was set at 25 items, and sample size, the impact level, and the amount of DIF were varied. Results show that the regression correction was very useful in controlling for the Type I error; CATSIB WORC had inflated observed Type I errors, especially when impact levels were high. The CATSIB WRC had observed Type I error rates very close to the nominal level of 0.05. The power rates of CATSIB WRC were impressive. As expected, the power increased as the sample size increased and as the amount of DIF increased. Even for small samples with high impact rates, power rates were 64\% or higher for high DIF levels. For large samples, power rates were over 90\% for high DIF levels. (Contains 12 tables and 7 references.) (Author/SLD)}, keywords = {computerized adaptive testing}, author = {Nandakumar, R. and Roussos, L. A.} } @conference {382, title = {A comparison of the traditional maximum information method and the global information method in CAT item selection}, booktitle = {annual meeting of the National Council on Measurement in Education}, year = {1996}, address = {New York, NY USA}, keywords = {computerized adaptive testing, item selection}, author = {Tang, K.
L.} } @article {23, title = {Dynamic scaling: An ipsative procedure using techniques from computer adaptive testing}, journal = {Dissertation Abstracts International: Section B: the Sciences \& Engineering}, volume = {56}, number = {10-B}, year = {1996}, pages = {5824}, abstract = {The purpose of this study was to create a prototype method for scaling items using computer adaptive testing techniques and to demonstrate the method with a working model program. The method can be used to scale items, to rank individuals with respect to the scaled items, and to re-scale the items with respect to the individuals{\textquoteright} responses. When using this prototype method, the items to be scaled are part of a database that contains not only the items, but measures of how individuals respond to each item. After completion of all presented items, the individual is assigned an overall scale value, which is then compared with each item responded to, and an individual "error" term is stored with each item. After several individuals have responded to the items, the item error terms are used to revise the placement of the scaled items. This revision feature allows the natural adaptation of one general list to reflect subgroup differences, for example, differences among geographic areas or ethnic groups. It also provides easy revision and limited authoring of the scale items by the computer program administrator. This study addressed the methodology, the instrumentation needed to handle the scale-item administration, data recording, item error analysis, and scale-item database editing required by the method, and the behavior of a prototype vocabulary test in use. Analyses were made of item ordering, response profiles, item stability, reliability, and validity. Although slow, the movement of unordered words used as items in the prototype program was accurate as determined by comparison with an expert word ranking. Person scores obtained by multiple administrations of the prototype test were reliable and correlated at .94 with a commercial paper-and-pencil vocabulary test, while holding a three-to-one speed advantage in administration. Although based upon self-report data, dynamic scaling instruments like the model vocabulary test could be very useful for self-assessment, for pre (PsycINFO Database Record (c) 2003 APA, all rights reserved).}, keywords = {computerized adaptive testing}, author = {Berg, S. R.} } @article {149, title = {The effect of individual differences variables on the assessment of ability for Computerized Adaptive Testing}, journal = {Dissertation Abstracts International: Section B: the Sciences \& Engineering}, volume = {57}, number = {6-B}, year = {1996}, pages = {4085}, abstract = {Computerized Adaptive Testing (CAT) continues to gain momentum as the accepted testing modality for a growing number of certification, licensure, education, government, and human resource applications. However, the developers of these tests have for the most part failed to adequately explore the impact of individual differences such as test anxiety on the adaptive testing process. It is widely accepted that non-cognitive individual differences variables interact with the assessment of ability when using written examinations. Logic would dictate that individual differences variables would equally affect CAT. Two studies were used to explore this premise. In the first study, 507 examinees were given a test anxiety survey prior to taking a high-stakes certification exam using CAT or using a written format.
All examinees had already completed their course of study, and the examination would be their last hurdle prior to being awarded certification. High test anxious examinees performed worse than their low anxious counterparts on both testing formats. The second study replicated the finding that anxiety depresses performance in CAT. It also addressed the differential effect of anxiety on within test performance. Examinees were candidates taking their final certification examination following a four year college program. Ability measures were calculated for each successive part of the test for 923 subjects. Within subject performance varied depending upon test position. High anxious examinees performed poorly at all points in the test, while low and medium anxious examinee performance peaked in the middle of the test. If test anxiety and performance measures were actually the same trait, then low anxious individuals should have performed equally well throughout the test. The observed interaction of test anxiety and time on task serves as strong evidence that test anxiety has motivationally mediated as well as cognitively mediated effects. The results of the studies are di (PsycINFO Database Record (c) 2003 APA, all rights reserved).}, keywords = {computerized adaptive testing}, author = {Gershon, R. C.} } @article {137, title = {Methodologic trends in the healthcare professions: computer adaptive and computer simulation testing}, journal = {Nurse Education}, volume = {21}, number = {4}, year = {1996}, note = {Forker, J EMcDonald, M EUnited statesNurse educatorNurse Educ. 1996 Jul-Aug;21(4):13-4.}, month = {Jul-Aug}, pages = {13-4}, edition = {1996/07/01}, abstract = {Assessing knowledge and performance on computer is rapidly becoming a common phenomenon in testing and measurement. Computer adaptive testing presents an individualized test format in accordance with the examinee{\textquoteright}s ability level. The efficiency of the testing process enables a more precise estimate of performance, often with fewer items than traditional paper-and-pencil testing methodologies. Computer simulation testing involves performance-based, or authentic, assessment of the examinee{\textquoteright}s clinical decision-making abilities. The authors discuss the trends in assessing performance through computerized means and the application of these methodologies to community-based nursing practice.}, keywords = {*Clinical Competence, *Computer Simulation, Computer-Assisted Instruction/*methods, Educational Measurement/*methods, Humans}, isbn = {0363-3624 (Print)0363-3624 (Linking)}, author = {Forker, J. E. and McDonald, M. E.} } @article {259, title = {Multidimensional computerized adaptive testing in a certification or licensure context}, journal = {Applied Psychological Measurement}, volume = {20}, number = {4}, year = {1996}, note = {Sage Publications, US}, pages = {389-404}, abstract = {(from the journal abstract) Multidimensional item response theory (MIRT) computerized adaptive testing, building on a recent work by D. O. Segall (1996), is applied in a licensing/certification context. An example of a medical licensure test is used to demonstrate situations in which complex, integrated content must be balanced at the total test level for validity reasons, but items assigned to reportable subscore categories may be used under a MIRT adaptive paradigm to improve the reliability of the subscores. 
A heuristic optimization framework is outlined that generalizes to both univariate and multivariate statistical objective functions, with additional systems of constraints included to manage the content balancing or other test specifications on adaptively constructed test forms. Simulation results suggested that a multivariate treatment of the problem, although complicating somewhat the objective function used and the estimation of traits, nonetheless produces advantages from a psychometric perspective. (PsycINFO Database Record (c) 2003 APA, all rights reserved).}, keywords = {computerized adaptive testing}, author = {Luecht, RM} } @article {122, title = {Assessment of scaled score consistency in adaptive testing from a multidimensional item response theory perspective}, journal = {Dissertation Abstracts International: Section B: the Sciences \& Engineering}, volume = {55}, number = {12-B}, year = {1995}, pages = {5598}, abstract = {The purpose of this study was twofold: (a) to examine whether the unidimensional adaptive testing estimates are comparable for different ability levels of examinees when the true examinee-item interaction is correctly modeled using a compensatory multidimensional item response theory (MIRT) model; and (b) to investigate the effects of adaptive testing estimation when the procedure of item selection of computerized adaptive testing (CAT) is controlled by either content-balancing or selecting the most informative item in a user-specified direction at the current estimate of unidimensional ability. A series of Monte Carlo simulations were conducted in this study. Deviation from the reference composite angle was used as an index of the θ1,θ2-composite consistency across the different levels of unidimensional CAT estimates. In addition, the effects of the content-balancing item selection procedure and the fixed-direction item selection procedure were compared across the different ability levels. The characteristics of item selection, test information, and the relationship between unidimensional and multidimensional models were also investigated. In addition to employing statistical analysis to examine the robustness of the CAT procedure to violations of unidimensionality, this research also included graphical analyses to present the results. The results were summarized as follows: (a) the reference angles for the no-control-item-selection method were disparate across the unidimensional ability groups; (b) the unidimensional CAT estimates from the content-balancing item selection method did not offer much improvement; (c) the fixed-direction item selection method did provide greater consistency for the unidimensional CAT estimates across the different levels of ability; and (d) increasing the CAT test length did not provide greater score scale consistency. Based on the results of this study, the following conclusions were drawn: (a) without any controlling (PsycINFO Database Record (c) 2003 APA, all rights reserved).}, keywords = {computerized adaptive testing}, author = {Fan, Miechu} } @inbook {25, title = {The equivalence of Rasch item calibrations and ability estimates across modes of administration}, booktitle = {Objective measurement: Theory into practice}, volume = {2}, year = {1994}, pages = {122-128}, publisher = {Ablex Publishing Co.}, organization = {Ablex Publishing Co.}, address = {Norwood, N.J. USA}, keywords = {computerized adaptive testing}, author = {Bergstrom, Betty A. and Lunz, M.
E.} } @article {221, title = {Monte Carlo simulation comparison of two-stage testing and computerized adaptive testing}, journal = {Dissertation Abstracts International Section A: Humanities \& Social Sciences}, volume = {54}, number = {7-A}, year = {1994}, pages = {2548}, keywords = {computerized adaptive testing}, author = {Kim, H-O.} } @article {284, title = {An application of Computerized Adaptive Testing to the Test of English as a Foreign Language}, journal = {Dissertation Abstracts International}, volume = {53}, number = {12-A}, year = {1993}, pages = {4257-4258}, keywords = {computerized adaptive testing}, author = {Moon, O.} } @article {224, title = {Assessing the utility of item response models: computerized adaptive testing}, journal = {Educational Measurement: Issues and Practice}, volume = {12}, number = {1}, year = {1993}, pages = {21-27}, keywords = {computerized adaptive testing}, author = {Kingsbury, G. G. and Houser, R.L.} } @article {339, title = {Comparability and validity of computerized adaptive testing with the MMPI-2}, journal = {Dissertation Abstracts International}, volume = {53}, number = {7-B}, year = {1993}, pages = {3791}, keywords = {computerized adaptive testing}, author = {Roper, B. L.} } @article {47, title = {Computer adaptive testing: A comparison of four item selection strategies when used with the golden section search strategy for estimating ability}, journal = {Dissertation Abstracts International}, volume = {54}, number = {5-A}, year = {1993}, pages = {1772}, keywords = {computerized adaptive testing}, author = {Carlson, R. D.} } @article {27, title = {Altering the level of difficulty in computer adaptive testing}, journal = {Applied Measurement in Education}, volume = {5}, number = {2}, year = {1992}, note = {Lawrence Erlbaum, US}, pages = {137-149}, abstract = {Examines the effect of altering test difficulty on examinee ability measures and test length in a computer adaptive test. The 225 Ss were randomly assigned to 3 test difficulty conditions and given a variable length computer adaptive test. Examinees in the hard, medium, and easy test condition took a test targeted at the 50\%, 60\%, or 70\% probability of correct response. The results show that altering the probability of a correct response does not affect estimation of examinee ability and that taking an easier computer adaptive test only slightly increases the number of items necessary to reach specified levels of precision. (PsycINFO Database Record (c) 2002 APA, all rights reserved).}, keywords = {computerized adaptive testing}, author = {Bergstrom, Betty A. and Lunz, M. E. and Gershon, R. 
C.} } @article {100, title = {The development and evaluation of a system for computerized adaptive testing}, journal = {Dissertation Abstracts International}, volume = {52}, number = {12-A}, year = {1992}, pages = {4304}, keywords = {computerized adaptive testing}, author = {de la Torre Sanchez, R.} } @article {318, title = {Test anxiety and test performance under computerized adaptive testing methods}, journal = {Dissertation Abstracts International}, volume = {52}, number = {7-A}, year = {1992}, pages = {2518}, keywords = {computerized adaptive testing}, author = {Powell, Zen-Hsiu E.} } @article {235, title = {A comparison of paper-and-pencil, computer-administered, computerized feedback, and computerized adaptive testing methods for classroom achievement testing}, journal = {Dissertation Abstracts International}, volume = {52}, number = {5-A}, year = {1991}, pages = {1719}, keywords = {computerized adaptive testing}, author = {Kuan, Tsung Hao} } @article {62, title = {Inter-subtest branching in computerized adaptive testing}, journal = {Dissertation Abstracts International}, volume = {52}, number = {1-A}, year = {1991}, pages = {140-141}, keywords = {computerized adaptive testing}, author = {Chang, S-H.} } @booklet {337, title = {Patterns of alcohol and drug use among federal offenders as assessed by the Computerized Lifestyle Screening Instrument}, number = {R-11}, year = {1991}, publisher = {Research and Statistics Branch, Correctional Service of Canada}, address = {Ottawa, ON. Canada}, keywords = {computerized adaptive testing, drug abuse, substance use}, isbn = {R-11}, author = {Robinson, D. and Porporino, F. J. and Millson, W. A.} } @article {98, title = {A simulation and comparison of flexilevel and Bayesian computerized adaptive testing}, journal = {Journal of Educational Measurement}, volume = {27}, number = {3}, year = {1990}, pages = {227-239}, abstract = {Computerized adaptive testing (CAT) is a testing procedure that adapts an examination to an examinee{\textquoteright}s ability by administering only items of appropriate difficulty for the examinee. In this study, the authors compared Lord{\textquoteright}s flexilevel testing procedure (flexilevel CAT) with an item response theory-based CAT using Bayesian estimation of ability (Bayesian CAT). Three flexilevel CATs, which differed in test length (36, 18, and 11 items), and three Bayesian CATs were simulated; the Bayesian CATs differed from one another in the standard error of estimate (SEE) used for terminating the test (0.25, 0.10, and 0.05). Results showed that the flexilevel 36- and 18-item CATs produced ability estimates that may be considered as accurate as those of the Bayesian CAT with SEE = 0.10 and comparable to the Bayesian CAT with SEE = 0.05. The authors discuss the implications for classroom testing and for item response theory-based CAT.}, keywords = {computerized adaptive testing}, author = {De Ayala, R. J. and Dodd, B. G. and Koch, W. R.} } @article {325, title = {Adaptive testing: The evolution of a good idea}, journal = {Educational Measurement: Issues and Practice}, volume = {8}, number = {3}, year = {1989}, pages = {11-15}, keywords = {computerized adaptive testing}, isbn = {1745-3992}, author = {Reckase, M.
D.} } @article {209, title = {Application of computerized adaptive testing to the University Entrance Exam of Taiwan, R.O.C.}, journal = {Dissertation Abstracts International}, volume = {49}, number = {12-A, Pt 1}, year = {1989}, pages = {3662}, keywords = {computerized adaptive testing}, author = {Hung, P-H.} } @mastersthesis {350, title = {An applied study on computerized adaptive testing}, year = {1989}, pages = {185}, school = {University of Groningen}, type = {Dissertation}, address = {Groningen, The Netherlands}, abstract = {(from the cover) The rapid development and falling prices of powerful personal computers, in combination with new test theories, will have a large impact on psychological testing. One of the new possibilities is computerized adaptive testing. During the test administration each item is chosen to be appropriate for the person being tested. The test becomes tailor-made, resolving some of the problems with classical paper-and-pencil tests. In this way individual differences can be measured with higher efficiency and reliability. Scores on other meaningful variables, such as response time, can be obtained easily using computers. In this book a study on computerized adaptive testing is described. The study took place at Dutch Railways in an applied setting and served practical goals. Topics discussed include the construction of computerized tests, the use of response time, the choice of algorithms, and the implications of using a latent trait model. After running a number of simulations and calibrating the item banks, an experiment was carried out. In the experiment a pretest was administered to a sample of over 300 applicants, followed by an adaptive test. In addition, a survey concerning the attitudes of testees towards computerized testing formed part of the design.}, keywords = {computerized adaptive testing}, author = {Schoonman, W.} } @article {22, title = {A real-data simulation of computerized adaptive administration of the MMPI}, journal = {Psychological Assessment}, volume = {1}, number = {1}, year = {1989}, note = {Article}, pages = {18-22}, abstract = {A real-data simulation of computerized adaptive administration of the MMPI was conducted with data obtained from two personnel-selection samples and two clinical samples. A modification of the countdown method was tested to determine the usefulness, in terms of item administration savings, of several different test administration procedures. Substantial item administration savings were achieved for all four samples, though the clinical samples required administration of more items to achieve accurate classification and/or full-scale scores than did the personnel-selection samples. The use of normative item endorsement frequencies was found to be as effective as sample-specific frequencies for the determination of item administration order. The role of computerized adaptive testing in the future of personality assessment is discussed. (C) 1989 by the American Psychological Association.}, keywords = {computerized adaptive testing}, author = {Ben-Porath, Y. S. and Slutske, W. S. and Butcher, J. N.} } @article {97, title = {Computerized adaptive testing: A comparison of the nominal response model and the three parameter model}, journal = {Dissertation Abstracts International}, volume = {48}, number = {10-B}, year = {1988}, pages = {3148}, keywords = {computerized adaptive testing}, author = {De Ayala, R.
J.} } @article {366, title = {The effect of item parameter estimation error on decisions made using the sequential probability ratio test}, number = {Research Report 87-1}, year = {1987}, institution = {DTIC Document}, address = {Iowa City, IA. USA}, keywords = {computerized adaptive testing, Sequential probability ratio test}, author = {Spray, J. A. and Reckase, M. D.} } @article {148, title = {An application of computer adaptive testing with communication handicapped examinees}, journal = {Educational and Psychological Measurement}, volume = {46}, number = {1}, year = {1986}, doi = {10.1177/0013164486461003}, pages = {23-35}, abstract = {This study was conducted to evaluate a computerized adaptive testing procedure for the measurement of mathematical skills of entry-level deaf college students. The theoretical basis of the study was the Rasch model for person measurement. Sixty persons were tested using an Apple II Plus microcomputer. Ability estimates provided by the computerized procedure were compared for stability with those obtained six to eight weeks earlier from conventional (written) testing of the same subject matter. Students{\textquoteright} attitudes toward their testing experiences also were measured. Substantial increases in measurement efficiency (by reducing test length) were realized through the adaptive testing procedure. Because the item pool used was not specifically designed for adaptive testing purposes, the psychometric quality of measurements resulting from the different testing methods was approximately equal. Attitudes toward computerized testing were favorable.}, keywords = {computerized adaptive testing}, isbn = {0013-1644}, author = {Garrison, W. M. and Baumgarten, B. S.} } @article {222, title = {Adaptive self-referenced testing as a procedure for the measurement of individual change due to instruction: A comparison of the reliabilities of change estimates obtained from conventional and adaptive testing procedures}, journal = {Dissertation Abstracts International}, volume = {45}, number = {9-B}, year = {1985}, pages = {3057}, keywords = {computerized adaptive testing}, author = {Kingsbury, G. G.} } @article {285, title = {Relationship between corresponding Armed Services Vocational Aptitude Battery (ASVAB) and computerized adaptive testing (CAT) subtests}, journal = {Applied Psychological Measurement}, volume = {8}, number = {2}, year = {1984}, note = {Sage Publications, US}, pages = {155-163}, abstract = {Investigated the relationships between selected subtests from the Armed Services Vocational Aptitude Battery (ASVAB) and corresponding subtests administered as computerized adaptive tests (CATs), using 270 17-26 yr old Marine recruits as Ss. Ss were administered the ASVAB before enlisting and approximately 2 wks after entering active duty, and the CAT tests were administered to Ss approximately 24 hrs after arriving at the recruit depot. Results indicate that 3 adaptive subtests correlated as well with ASVAB as did the 2nd administration of the ASVAB, although CAT subtests contained only half the number of items. Factor analysis showed CAT subtests to load on the same factors as the corresponding ASVAB subtests, indicating that the same abilities were being measured. It is concluded that CAT can achieve the same measurement precision as a conventional test, with half the number of items. (16 ref)}, keywords = {computerized adaptive testing}, author = {Moreno, K. E. and Wetzel, C. D. and
McBride, J. R. and Weiss, D. J.} } @article {160, title = {Technical guidelines for assessing computerized adaptive tests}, journal = {Journal of Educational Measurement}, volume = {21}, number = {4}, year = {1984}, pages = {347-360}, keywords = {computerized adaptive testing, Mode effects, paper-and-pencil}, isbn = {1745-3984}, author = {Green, B. F. and Bock, R. D. and Humphreys, L. G. and Linn, R. L. and Reckase, M. D.} } @inbook {324, title = {A procedure for decision making using tailored testing}, booktitle = {New horizons in testing: Latent trait theory and computerized adaptive testing}, year = {1983}, pages = {237-254}, publisher = {Academic Press}, organization = {Academic Press}, address = {New York, NY. USA}, keywords = {CCAT, classification, computerized adaptive testing, sequential probability ratio testing, SPRT}, author = {Reckase, M. D.} } @article {303, title = {Ability measurement, test bias reduction, and psychological reactions to testing as a function of computer adaptive testing versus conventional testing}, journal = {Dissertation Abstracts International}, volume = {42}, number = {10-B}, year = {1982}, pages = {4233}, keywords = {computerized adaptive testing}, author = {Orban, J. A.} } @article {256, title = {Sequential testing for dichotomous decisions}, journal = {Educational and Psychological Measurement}, volume = {32}, number = {1}, year = {1972}, pages = {85-95}, keywords = {CCAT, classification, computerized adaptive testing, sequential probability ratio testing, SPRT}, author = {Linn, R. L. and Rock, D. A. and Cleary, T. A.} }